162 lines
5.8 KiB
Plaintext
162 lines
5.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7254b4c1",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Спам-гипотеза: сравнение моделей\n",
|
|
"\n",
|
|
"Target: `high_ctr` (верхний квартиль CTR)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c7f54168",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"import sqlite3\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"from sklearn.base import clone\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"\n",
"sns.set_theme(style=\"whitegrid\")\n",
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
"\n",
"# Locate the repository root: the nearest directory, starting from the working\n",
"# directory and walking upwards, that contains a `preanalysis` folder.\n",
"start_dir = Path.cwd().resolve()\n",
"project_root = next(\n",
"    (p for p in (start_dir, *start_dir.parents) if (p / \"preanalysis\").is_dir()),\n",
"    None,\n",
")\n",
"if project_root is None:\n",
"    # Fail here with a clear message rather than later with an unrelated\n",
"    # import error or a database path resolved against the filesystem root.\n",
"    raise FileNotFoundError(f\"No 'preanalysis' directory found in {start_dir} or any of its parents\")\n",
"\n",
"# Re-running this cell must not keep appending the same entry to sys.path.\n",
"preanalysis_dir = str(project_root / \"preanalysis\")\n",
"if preanalysis_dir not in sys.path:\n",
"    sys.path.append(preanalysis_dir)\n",
"import eda_utils as eda  # project-local helpers; importable only after the sys.path entry above\n",
"\n",
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
"if not db_path.is_file():\n",
"    # sqlite3.connect() creates a new empty database when the file is missing,\n",
"    # which would then fail below with a misleading \"no such table\" error.\n",
"    raise FileNotFoundError(f\"SQLite dataset not found: {db_path}\")\n",
"\n",
"conn = sqlite3.connect(db_path)\n",
"try:\n",
"    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
"finally:\n",
"    # Close the connection even when the query raises.\n",
"    conn.close()\n",
"\n",
"HIGH_CTR_QUANTILE = 0.75  # `high_ctr` flags clients at or above this quantile of overall CTR\n",
"\n",
"\n",
"def first_mode(values):\n",
"    \"\"\"Return the most frequent non-null value of a Series, or NaN when there is none.\n",
"\n",
"    `Series.mode()` is empty for an all-null group, so an unguarded\n",
"    `.iat[0]` would raise IndexError for such a client.\n",
"    \"\"\"\n",
"    modes = values.mode()\n",
"    return modes.iat[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# Row-level totals over the column groups listed in eda_utils.\n",
"for cols, name in [\n",
"    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
"    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
"    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
"    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
"    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
"]:\n",
"    df[name] = df[cols].sum(axis=1)\n",
"\n",
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
"\n",
"# One row per client id: summed activity, a representative age/gender/device,\n",
"# and the number of distinct dates on which the client appears.\n",
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
"client = (\n",
"    df.groupby(\"id\")\n",
"    .agg(\n",
"        {\n",
"            \"imp_total\": \"sum\",\n",
"            \"click_total\": \"sum\",\n",
"            \"orders_amt_total\": \"sum\",\n",
"            \"age\": \"median\",\n",
"            \"gender_cd\": first_mode,\n",
"            \"device_platform_cd\": first_mode,\n",
"        }\n",
"    )\n",
"    .join(contact_days)  # both sides are indexed by `id`; left join keeps every client\n",
"    .reset_index()\n",
")\n",
"\n",
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
"\n",
"# NOTE(review): if eda.safe_divide yields NaN for a zero denominator, those clients\n",
"# compare False against the threshold and are labeled high_ctr = 0 -- confirm intended.\n",
"ctr_threshold = client[\"ctr_all\"].quantile(HIGH_CTR_QUANTILE)\n",
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= ctr_threshold).astype(int)\n",
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "21786c63",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Модели: Logistic Regression vs GradientBoosting"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dc8dbc94",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"SEED = 42  # shared by the train/test split and the stochastic estimator\n",
"\n",
"# `click_total` is deliberately NOT a feature: the target `high_ctr` is a threshold on\n",
"# click_total / imp_total, so giving the model both totals lets it reconstruct the\n",
"# label directly (target leakage) instead of testing the exposure hypothesis.\n",
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"age\"]\n",
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
"\n",
"X = client[num_cols + cat_cols].copy()\n",
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
"y = client[\"high_ctr\"]\n",
"\n",
"# Unfitted preprocessing template: median-impute and scale numerics, one-hot encode categoricals.\n",
"pre = ColumnTransformer([\n",
"    (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
"    (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
"])\n",
"\n",
"# A Pipeline fits its steps in place, so each model gets its own clone of the\n",
"# preprocessor instead of both refitting one shared instance.\n",
"log_reg = Pipeline([(\"pre\", clone(pre)), (\"clf\", LogisticRegression(max_iter=1000))])\n",
"gb = Pipeline([(\"pre\", clone(pre)), (\"clf\", GradientBoostingClassifier(random_state=SEED))])\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)\n",
"\n",
"# Hold-out ROC AUC per model, keyed by model name.\n",
"res = {}\n",
"for name, model in [(\"log_reg\", log_reg), (\"gb\", gb)]:\n",
"    model.fit(X_train, y_train)\n",
"    proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class\n",
"    res[name] = roc_auc_score(y_test, proba)\n",
"res\n"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "203acf70",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Важности признаков (GradientBoosting)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3eac9e17",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Pair each transformed feature name with the fitted booster's importance for it.\n",
"gb_model = gb\n",
"feat_names = gb_model.named_steps[\"pre\"].get_feature_names_out()\n",
"importances = gb_model.named_steps[\"clf\"].feature_importances_\n",
"imp_df = pd.DataFrame({\"feature\": feat_names, \"importance\": importances}).sort_values(\"importance\", ascending=False)\n",
"top_imp = imp_df.head(15)\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 5))\n",
"# seaborn >= 0.13 deprecates `palette` without `hue`: map hue to the y variable\n",
"# and hide the redundant legend to keep the same per-bar colouring.\n",
"sns.barplot(data=top_imp, x=\"importance\", y=\"feature\", hue=\"feature\", palette=\"viridis\", legend=False, ax=ax)\n",
"ax.set_title(\"Top-15 feature importances (GB)\")\n",
"fig.tight_layout()\n",
"plt.show()\n",
"top_imp\n"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.13"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|