fully working spam hypot

2025-12-12 23:27:23 +03:00
parent ce595182b9
commit c5c10d1fcf
19 changed files with 39 additions and 3882 deletions
--- a/spam_hypot/01_stat_analysis.ipynb
+++ b/spam_hypot/01_stat_analysis.ipynb
@@ -1,188 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "4d7d3347",
-   "metadata": {},
-   "source": [
-    "# Спам-гипотеза: плотность показов vs CTR/CR\n",
-    "\n",
-    "Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7acbd1c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sqlite3\n",
-    "from pathlib import Path\n",
-    "import sys\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "from scipy import stats\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
-    "from sklearn.compose import ColumnTransformer\n",
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.metrics import roc_auc_score\n",
-    "\n",
-    "sns.set_theme(style=\"whitegrid\")\n",
-    "plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
-    "\n",
-    "project_root = Path.cwd().resolve()\n",
-    "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
-    "    project_root = project_root.parent\n",
-    "sys.path.append(str(project_root / \"preanalysis\"))\n",
-    "import eda_utils as eda\n",
-    "\n",
-    "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
-    "conn = sqlite3.connect(db_path)\n",
-    "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
-    "conn.close()\n",
-    "\n",
-    "for cols, name in [\n",
-    "    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
-    "    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
-    "    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
-    "    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
-    "    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
-    "]:\n",
-    "    df[name] = df[cols].sum(axis=1)\n",
-    "\n",
-    "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
-    "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
-    "\n",
-    "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
-    "client = df.groupby(\"id\").agg(\n",
-    "    {\n",
-    "        \"imp_total\": \"sum\",\n",
-    "        \"click_total\": \"sum\",\n",
-    "        \"orders_amt_total\": \"sum\",\n",
-    "        \"age\": \"median\",\n",
-    "        \"gender_cd\": lambda s: s.mode().iat[0],\n",
-    "        \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
-    "    }\n",
-    ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
-    "\n",
-    "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
-    "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
-    "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
-    "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
-    "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "94eb2d26",
-   "metadata": {},
-   "source": [
-    "## Базовые статистики"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "287a09b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "summary = client[[\"imp_total\", \"click_total\", \"orders_amt_total\", \"contact_days\", \"avg_imp_per_day\", \"ctr_all\", \"cr_click2order\"]].describe().T\n",
-    "missing = client.isna().mean().sort_values(ascending=False)\n",
-    "summary, missing.head(10)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "10cd44b7",
-   "metadata": {},
-   "source": [
-    "## Корреляции и тесты\n",
-    "Спирмен между плотностью и CTR/CR, а также Mann–Whitney между Q1 и Q4 по плотности."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "88714a03",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "corr_ctr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"ctr_all\"])\n",
-    "corr_cr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"cr_click2order\"])\n",
-    "q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n",
-    "q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n",
-    "low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n",
-    "high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n",
-    "wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n",
-    "{ \"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu }\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "20d492fa",
-   "metadata": {},
-   "source": [
-    "bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
-    "stats_bin = client.groupby(bins, observed=False).agg(\n",
-    "    ctr_all=(\"ctr_all\", \"median\"),\n",
-    "    cr_click2order=(\"cr_click2order\", \"median\"),\n",
-    "    avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n",
-    ").reset_index()\n",
-    "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
-    "fig, ax1 = plt.subplots(figsize=(12, 5))\n",
-    "ax2 = ax1.twinx()\n",
-    "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
-    "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
-    "ax1.set_ylabel(\"CTR\")\n",
-    "ax2.set_ylabel(\"CR click→order\")\n",
-    "plt.xticks(rotation=35)\n",
-    "ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
-    "fig.tight_layout()\n",
-    "plt.show()\n",
-    "stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "943f0d4b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
-    "stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
-    "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
-    "fig, ax1 = plt.subplots(figsize=(12, 5))\n",
-    "ax2 = ax1.twinx()\n",
-    "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
-    "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
-    "ax1.set_ylabel(\"CTR\")\n",
-    "ax2.set_ylabel(\"CR click→order\")\n",
-    "plt.xticks(rotation=35)\n",
-    "ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
-    "fig.tight_layout()\n",
-    "plt.show()\n",
-    "stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/spam_hypot/02_models.ipynb
+++ b/spam_hypot/02_models.ipynb
@@ -1,161 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "7254b4c1",
-   "metadata": {},
-   "source": [
-    "# Спам-гипотеза: сравнение моделей\n",
-    "\n",
-    "Target: `high_ctr` (верхний квартиль CTR)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c7f54168",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sqlite3\n",
-    "from pathlib import Path\n",
-    "import sys\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "from scipy import stats\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
-    "from sklearn.compose import ColumnTransformer\n",
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.metrics import roc_auc_score\n",
-    "\n",
-    "sns.set_theme(style=\"whitegrid\")\n",
-    "plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
-    "\n",
-    "project_root = Path.cwd().resolve()\n",
-    "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
-    "    project_root = project_root.parent\n",
-    "sys.path.append(str(project_root / \"preanalysis\"))\n",
-    "import eda_utils as eda\n",
-    "\n",
-    "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
-    "conn = sqlite3.connect(db_path)\n",
-    "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
-    "conn.close()\n",
-    "\n",
-    "for cols, name in [\n",
-    "    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
-    "    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
-    "    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
-    "    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
-    "    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
-    "]:\n",
-    "    df[name] = df[cols].sum(axis=1)\n",
-    "\n",
-    "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
-    "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
-    "\n",
-    "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
-    "client = df.groupby(\"id\").agg(\n",
-    "    {\n",
-    "        \"imp_total\": \"sum\",\n",
-    "        \"click_total\": \"sum\",\n",
-    "        \"orders_amt_total\": \"sum\",\n",
-    "        \"age\": \"median\",\n",
-    "        \"gender_cd\": lambda s: s.mode().iat[0],\n",
-    "        \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
-    "    }\n",
-    ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
-    "\n",
-    "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
-    "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
-    "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
-    "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
-    "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "21786c63",
-   "metadata": {},
-   "source": [
-    "## Модели: Logistic Regression vs GradientBoosting"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dc8dbc94",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
-    "X = X.copy()\n",
-    "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
-    "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
-    "y = client[\"high_ctr\"]\n",
-    "\n",
-    "num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
-    "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
-    "pre = ColumnTransformer([\n",
-    "    (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
-    "    (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
-    "])\n",
-    "\n",
-    "log_reg = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n",
-    "gb = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
-    "\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
-    "res = {}\n",
-    "for name, model in [(\"log_reg\", log_reg), (\"gb\", gb)]:\n",
-    "    model.fit(X_train, y_train)\n",
-    "    proba = model.predict_proba(X_test)[:, 1]\n",
-    "    res[name] = roc_auc_score(y_test, proba)\n",
-    "res\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "203acf70",
-   "metadata": {},
-   "source": [
-    "## Важности признаков (GradientBoosting)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3eac9e17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gb_model = gb\n",
-    "feat_names = gb_model.named_steps[\"pre\"].get_feature_names_out()\n",
-    "importances = gb_model.named_steps[\"clf\"].feature_importances_\n",
-    "imp_df = pd.DataFrame({\"feature\": feat_names, \"importance\": importances}).sort_values(\"importance\", ascending=False)\n",
-    "plt.figure(figsize=(8, 5))\n",
-    "sns.barplot(data=imp_df.head(15), x=\"importance\", y=\"feature\", palette=\"viridis\")\n",
-    "plt.title(\"Top-15 feature importances (GB)\")\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "imp_df.head(15)\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/spam_hypot/03_best_model.ipynb
+++ b/spam_hypot/03_best_model.ipynb
@@ -1,206 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "d88bf2d8",
-   "metadata": {},
-   "source": [
-    "# Спам-гипотеза: лучшая модель и визуализации\n",
-    "\n",
-    "Используем GradientBoostingClassifier (лучше логрега по AUC) для подтверждения гипотезы."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "87f3f728",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sqlite3\n",
-    "from pathlib import Path\n",
-    "import sys\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "from scipy import stats\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
-    "from sklearn.compose import ColumnTransformer\n",
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.metrics import roc_auc_score\n",
-    "\n",
-    "sns.set_theme(style=\"whitegrid\")\n",
-    "plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
-    "\n",
-    "project_root = Path.cwd().resolve()\n",
-    "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
-    "    project_root = project_root.parent\n",
-    "sys.path.append(str(project_root / \"preanalysis\"))\n",
-    "import eda_utils as eda\n",
-    "\n",
-    "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
-    "conn = sqlite3.connect(db_path)\n",
-    "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
-    "conn.close()\n",
-    "\n",
-    "for cols, name in [\n",
-    "    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
-    "    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
-    "    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
-    "    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
-    "    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
-    "]:\n",
-    "    df[name] = df[cols].sum(axis=1)\n",
-    "\n",
-    "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
-    "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
-    "\n",
-    "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
-    "client = df.groupby(\"id\").agg(\n",
-    "    {\n",
-    "        \"imp_total\": \"sum\",\n",
-    "        \"click_total\": \"sum\",\n",
-    "        \"orders_amt_total\": \"sum\",\n",
-    "        \"age\": \"median\",\n",
-    "        \"gender_cd\": lambda s: s.mode().iat[0],\n",
-    "        \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
-    "    }\n",
-    ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
-    "\n",
-    "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
-    "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
-    "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
-    "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
-    "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "17da010c",
-   "metadata": {},
-   "source": [
-    "## Обучение лучшей модели"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "81433d7e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
-    "X = X.copy()\n",
-    "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
-    "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
-    "y = client[\"high_ctr\"]\n",
-    "\n",
-    "num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
-    "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
-    "pre = ColumnTransformer([\n",
-    "    (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
-    "    (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
-    "])\n",
-    "\n",
-    "best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
-    "best.fit(X_train, y_train)\n",
-    "proba = best.predict_proba(X_test)[:, 1]\n",
-    "auc = roc_auc_score(y_test, proba)\n",
-    "auc\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "63f4db9b",
-   "metadata": {},
-   "source": [
-    "## Прогноз vs плотность показов"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f48584b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n",
-    "base = client.median(numeric_only=True)\n",
-    "base_gender = client[\"gender_cd\"].mode().iat[0]\n",
-    "base_device = client[\"device_platform_cd\"].mode().iat[0]\n",
-    "grid[\"imp_total\"] = base[\"imp_total\"]\n",
-    "grid[\"click_total\"] = base[\"click_total\"]\n",
-    "grid[\"age\"] = base[\"age\"]\n",
-    "grid[\"gender_cd\"] = base_gender\n",
-    "grid[\"device_platform_cd\"] = base_device\n",
-    "proba_grid = best.predict_proba(grid)[:, 1]\n",
-    "plt.figure(figsize=(10, 4))\n",
-    "plt.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n",
-    "plt.xlabel(\"avg_imp_per_day\")\n",
-    "plt.ylabel(\"P(high CTR)\")\n",
-    "plt.title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32f73b44",
-   "metadata": {},
-   "source": [
-    "## График CTR и CR по тонким бинам (две оси)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bb4d0190",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\")\n",
-    "stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
-    "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
-    "fig, ax1 = plt.subplots(figsize=(12, 5))\n",
-    "ax2 = ax1.twinx()\n",
-    "ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", color=\"#4c72b0\", label=\"CTR\")\n",
-    "ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n",
-    "ax1.set_ylabel(\"CTR\")\n",
-    "ax2.set_ylabel(\"CR click→order\")\n",
-    "ax1.set_xlabel(\"avg_imp_per_day bins\")\n",
-    "plt.xticks(rotation=35)\n",
-    "ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n",
-    "fig.tight_layout()\n",
-    "plt.show()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ebb2ca5e",
-   "metadata": {},
-   "source": [
-    "## Вывод\n",
-    "- AUC модели GradientBoosting > логрега; `avg_imp_per_day` ключевой драйвер: рост плотности снижает шанс попасть в верхний квартиль CTR.\n",
-    "- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n",
-    "- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/spam_hypot/best_bins.png
+++ b/spam_hypot/best_bins.png
--- a/spam_hypot/best_model_prob.png
+++ b/spam_hypot/best_model_prob.png
--- a/spam_hypot/model_compare.py
+++ b/spam_hypot/model_compare.py
@@ -46,32 +46,62 @@ client = (
    .merge(contact_days, on="id", how="left")
    .reset_index()
 )
+# Aggregation above is unchanged, up to and including client["ctr_all"].
+
 client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
 client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
-client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)

-X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
-X = X.copy()
-X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
-X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
-y = client["high_ctr"]
+# --- SPLIT FIRST, TARGET AFTERWARDS (quantile threshold is computed on train only) ---
+train_idx, test_idx = train_test_split(
+    client.index, test_size=0.2, random_state=42
+)

-num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
+train = client.loc[train_idx].copy()
+test = client.loc[test_idx].copy()
+
+thr = train["ctr_all"].quantile(0.75)   # порог только по train
+train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
+test["high_ctr"]  = (test["ctr_all"]  >= thr).astype(int)
+
+# --- FEATURES WITHOUT click_total (it leaks the target: ctr_all = click_total / imp_total) ---
+X_train = train[[
+    "avg_imp_per_day", "imp_total", "contact_days",  # можно оставить
+    "age", "gender_cd", "device_platform_cd"
+]].copy()
+X_test = test[[
+    "avg_imp_per_day", "imp_total", "contact_days",
+    "age", "gender_cd", "device_platform_cd"
+]].copy()
+
+X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
+X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
+X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
+X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
+
+y_train = train["high_ctr"]
+y_test = test["high_ctr"]
+
+num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
 cat_cols = ["gender_cd", "device_platform_cd"]
+
 pre = ColumnTransformer([
-    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
+    ("num", Pipeline([
+        ("imputer", SimpleImputer(strategy="median")),
+        ("scaler", StandardScaler())
+    ]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
 ])

 log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
 gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])

-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
 results = {}
 for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)
+
+print("CTR threshold (train 0.75q):", thr)
 print("AUC results:", results)

 imp = gb.named_steps["clf"].feature_importances_
--- a/spam_hypot/stat_bins.png
+++ b/spam_hypot/stat_bins.png