{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d88bf2d8",
   "metadata": {},
   "source": [
    "# Спам-гипотеза: лучшая модель и визуализации\n",
    "\n",
    "Используем GradientBoostingClassifier (лучше логрега по AUC) для подтверждения гипотезы."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87f3f728",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "from pathlib import Path\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "# Needed by the training cell; without it a fresh-kernel run fails with NameError.\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "sns.set_theme(style=\"whitegrid\")\n",
    "plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
    "\n",
    "# Walk up from the working directory until a folder containing `preanalysis` is found,\n",
    "# so the notebook can be launched from any subdirectory of the project.\n",
    "project_root = Path.cwd().resolve()\n",
    "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
    "    project_root = project_root.parent\n",
    "preanalysis_dir = str(project_root / \"preanalysis\")\n",
    "if preanalysis_dir not in sys.path:  # re-running this cell must not stack duplicate entries\n",
    "    sys.path.append(preanalysis_dir)\n",
    "import eda_utils as eda\n",
    "\n",
    "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
    "# sqlite3.connect() silently creates an empty database for a missing path; fail early instead.\n",
    "if not db_path.exists():\n",
    "    raise FileNotFoundError(f\"SQLite dataset not found: {db_path}\")\n",
    "conn = sqlite3.connect(db_path)\n",
    "try:\n",
    "    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
    "finally:\n",
    "    conn.close()  # release the handle even when the query raises\n",
    "\n",
    "# Row-level totals for each column group declared in eda_utils.\n",
    "for cols, name in [\n",
    "    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
    "    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
    "    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
    "    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
    "    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
    "]:\n",
    "    df[name] = df[cols].sum(axis=1)\n",
    "\n",
    "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
    "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
    "\n",
    "\n",
    "def first_mode(values):\n",
    "    \"\"\"Return the most frequent non-null value of a Series, or NaN when every value is null.\"\"\"\n",
    "    modes = values.mode()\n",
    "    # mode() drops nulls, so an all-null group yields an empty result and .iat[0] would raise.\n",
    "    return modes.iat[0] if not modes.empty else np.nan\n",
    "\n",
    "\n",
    "# Per-client aggregates: one row per `id`, plus the number of distinct contact dates.\n",
    "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
    "client = df.groupby(\"id\").agg(\n",
    "    {\n",
    "        \"imp_total\": \"sum\",\n",
    "        \"click_total\": \"sum\",\n",
    "        \"orders_amt_total\": \"sum\",\n",
    "        \"age\": \"median\",\n",
    "        \"gender_cd\": first_mode,\n",
    "        \"device_platform_cd\": first_mode,\n",
    "    }\n",
    ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
    "\n",
    "# Ratio metrics; eda.safe_divide is a project helper (presumably guards zero denominators - confirm).\n",
    "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
    "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
    "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
    "# Targets: top CTR quartile across all clients, and whether the client ordered at all.\n",
    "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
    "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17da010c",
   "metadata": {},
   "source": [
    "## Обучение лучшей модели"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81433d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the target `high_ctr` is derived from click_total / imp_total, and both totals\n",
    "# are also features, so the model can reconstruct the label (target leakage). The AUC and the\n",
    "# apparent role of `avg_imp_per_day` should be re-checked with `click_total` excluded.\n",
    "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
    "X = X.copy()  # work on a copy so the normalisation below does not touch `client`\n",
    "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
    "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
    "y = client[\"high_ctr\"]\n",
    "\n",
    "num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
    "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
    "# Numeric columns: median imputation then standardisation. Categorical columns: one-hot,\n",
    "# with categories unseen during fit encoded as all zeros.\n",
    "pre = ColumnTransformer([\n",
    "    (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
    "    (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
    "])\n",
    "\n",
    "best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
    "# Stratified 80/20 split with a fixed seed so the reported AUC is reproducible.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
    "best.fit(X_train, y_train)\n",
    "proba = best.predict_proba(X_test)[:, 1]\n",
    "auc = roc_auc_score(y_test, proba)\n",
    "auc\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63f4db9b",
   "metadata": {},
   "source": [
    "## Прогноз vs плотность показов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f48584b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sweep avg_imp_per_day over its observed range while holding every other feature at a\n",
    "# typical value (median for numeric columns, mode for categorical ones).\n",
    "grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n",
    "base = client.median(numeric_only=True)\n",
    "# Take the modal categories from X (the normalised training frame) rather than from raw `client`:\n",
    "# the encoder was fitted on normalised values, and a raw value that does not match would be\n",
    "# silently encoded as all zeros by handle_unknown=\"ignore\".\n",
    "base_gender = X[\"gender_cd\"].mode().iat[0]\n",
    "base_device = X[\"device_platform_cd\"].mode().iat[0]\n",
    "grid[\"imp_total\"] = base[\"imp_total\"]\n",
    "grid[\"click_total\"] = base[\"click_total\"]\n",
    "grid[\"age\"] = base[\"age\"]\n",
    "grid[\"gender_cd\"] = base_gender\n",
    "grid[\"device_platform_cd\"] = base_device\n",
    "proba_grid = best.predict_proba(grid)[:, 1]\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n",
    "plt.xlabel(\"avg_imp_per_day\")\n",
    "plt.ylabel(\"P(high CTR)\")\n",
    "plt.title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32f73b44",
   "metadata": {},
   "source": [
    "## График CTR и CR по тонким бинам (две оси)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb4d0190",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Up to 15 quantile bins of avg_imp_per_day; duplicate bin edges are dropped, so fewer bins are possible.\n",
    "bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\")\n",
    "# observed=True is passed explicitly: quantile bins are never empty, so the result is unchanged\n",
    "# and pandas no longer warns about the changing default for categorical groupers.\n",
    "stats_bin = client.groupby(bins, observed=True).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
    "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
    "fig, ax1 = plt.subplots(figsize=(12, 5))\n",
    "ax2 = ax1.twinx()\n",
    "(ctr_line,) = ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", color=\"#4c72b0\", label=\"CTR\")\n",
    "(cr_line,) = ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n",
    "ax1.set_ylabel(\"CTR\")\n",
    "ax2.set_ylabel(\"CR click→order\")\n",
    "ax1.set_xlabel(\"avg_imp_per_day bins\")\n",
    "# Rotate the labels on ax1 explicitly: after twinx() the current axes is ax2, whose x-axis is\n",
    "# hidden, so plt.xticks(rotation=...) would leave the visible labels unrotated.\n",
    "ax1.tick_params(axis=\"x\", labelrotation=35)\n",
    "# One legend for both axes, attached to ax2 because it is drawn on top of ax1.\n",
    "ax2.legend(handles=[ctr_line, cr_line], loc=\"best\")\n",
    "ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n",
    "fig.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebb2ca5e",
   "metadata": {},
   "source": [
    "## Вывод\n",
    "- AUC модели GradientBoosting > логрега; `avg_imp_per_day` ключевой драйвер: рост плотности снижает шанс попасть в верхний квартиль CTR.\n",
    "- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n",
    "- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}