179 lines
6.9 KiB
Plaintext
179 lines
6.9 KiB
Plaintext
{
|
||
"cells": [
|
||
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Категорийный микс и вероятность заказа\n",
"\n",
"**Вопрос:** связана ли высокая доля показов в категории «развлечения» (ent) с вероятностью заказа при контроле общего объёма показов?\n",
"\n",
"**Гипотеза:** клиенты с высокой долей показов в ent чаще оформляют заказы, даже при одинаковом объёме показов. Проверяем через ML-классификацию `has_order` (логистическая регрессия)."
]
},
|
||
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import sys\n",
"from contextlib import closing\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"sns.set_theme(style=\"whitegrid\")\n",
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
"\n",
"# Walk up from the working directory until a folder containing `preanalysis` is found.\n",
"project_root = Path.cwd().resolve()\n",
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
"    project_root = project_root.parent\n",
"# The loop also stops at the filesystem root, so report a failed search explicitly.\n",
"if not (project_root / \"preanalysis\").exists():\n",
"    raise FileNotFoundError(\"Could not locate a 'preanalysis' directory in or above the working directory\")\n",
"\n",
"# Re-running the cell must not keep appending the same entry to sys.path.\n",
"preanalysis_dir = str(project_root / \"preanalysis\")\n",
"if preanalysis_dir not in sys.path:\n",
"    sys.path.append(preanalysis_dir)\n",
"import eda_utils as eda\n",
"\n",
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
"# sqlite3.connect() creates an empty database for a missing path, so fail early instead.\n",
"if not db_path.exists():\n",
"    raise FileNotFoundError(f\"SQLite database not found: {db_path}\")\n",
"# closing() releases the connection even when the query raises.\n",
"with closing(sqlite3.connect(db_path)) as conn:\n",
"    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n"
]
},
|
||
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cats = [\"ent\", \"super\", \"transport\", \"shopping\", \"hotel\", \"avia\"]\n",
"\n",
"# Row-level totals over the column groups declared in eda_utils.\n",
"for cols, name in [\n",
"    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
"    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
"    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
"    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
"    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
"]:\n",
"    df[name] = df[cols].sum(axis=1)\n",
"\n",
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
"\n",
"\n",
"def first_mode(values):\n",
"    \"\"\"Return the most frequent non-null value, or NaN when every value is missing.\"\"\"\n",
"    modes = values.mode()\n",
"    return modes.iat[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"agg_dict = {\n",
"    \"imp_total\": \"sum\",\n",
"    \"click_total\": \"sum\",\n",
"    \"orders_amt_total\": \"sum\",\n",
"    \"age\": \"median\",\n",
"    \"gender_cd\": first_mode,\n",
"    \"device_platform_cd\": first_mode,\n",
"}\n",
"# Dict-style agg maps column -> function. A (column, func) tuple as a value would be\n",
"# interpreted as a list of functions to apply, not as a named aggregation.\n",
"for c in cats:\n",
"    agg_dict[f\"active_imp_{c}\"] = \"sum\"\n",
"    agg_dict[f\"passive_imp_{c}\"] = \"sum\"\n",
"\n",
"# One row per client id.\n",
"client = df.groupby(\"id\").agg(agg_dict).reset_index()\n",
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n",
"# Share of each category in the client's total impressions (active + passive).\n",
"for c in cats:\n",
"    client[f\"share_imp_{c}\"] = eda.safe_divide(client[f\"active_imp_{c}\"] + client[f\"passive_imp_{c}\"], client[\"imp_total\"])\n",
"\n",
"client.head()\n"
]
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Визуализация: заказы vs доля ent"
|
||
]
|
||
},
|
||
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quantile bins of the ent impression share; duplicate edges are dropped, so fewer than 8 bins may remain.\n",
"bins = pd.qcut(client[\"share_imp_ent\"], 8, duplicates=\"drop\")\n",
"# observed is passed explicitly because the implicit default for categorical groupers is deprecated.\n",
"rate = client.groupby(bins, observed=True)[\"has_order\"].mean().reset_index()\n",
"# Interval labels are converted to strings so they can be used as x-axis categories.\n",
"rate[\"share_imp_ent\"] = rate[\"share_imp_ent\"].astype(str)\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 4))\n",
"sns.lineplot(data=rate, x=\"share_imp_ent\", y=\"has_order\", marker=\"o\", ax=ax)\n",
"ax.tick_params(axis=\"x\", labelrotation=40)\n",
"ax.set_xlabel(\"Квантильный бин доли ent-показов\")\n",
"ax.set_ylabel(\"Доля клиентов с заказом\")\n",
"ax.set_title(\"Доля клиентов с заказом vs доля ent показов\")\n",
"fig.tight_layout()\n",
"plt.show()\n",
"rate\n"
]
},
|
||
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ML-модель с контролем объёма\n",
"\n",
"Целевая переменная: `has_order`. Признаки: доли показов по категориям (`share_imp_*`), общий объём показов (`imp_total`), возраст, пол, платформа устройства."
]
},
|
||
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Feature groups: per-category impression shares plus volume and demographics.\n",
"# NOTE(review): the six shares may sum to ~1 per client, which would make individual\n",
"# coefficients hard to interpret on their own - confirm against eda_utils column groups.\n",
"share_cols = [f\"share_imp_{c}\" for c in cats]\n",
"numeric_cols = share_cols + [\"imp_total\", \"age\"]\n",
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
"\n",
"# Work on a copy so the normalisation below does not write into `client`.\n",
"X = client[share_cols + [\"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]].copy()\n",
"y = client[\"has_order\"]\n",
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
"\n",
"# Scale numeric features, one-hot encode categorical ones.\n",
"numeric_pipe = Pipeline([(\"scaler\", StandardScaler())])\n",
"pre = ColumnTransformer(\n",
"    [\n",
"        (\"num\", numeric_pipe, numeric_cols),\n",
"        (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
"    ]\n",
")\n",
"model = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n",
"\n",
"# Stratified hold-out split with a fixed seed for reproducibility.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y\n",
")\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Hold-out ROC AUC from the predicted probability of the positive class.\n",
"proba = model.predict_proba(X_test)[:, 1]\n",
"auc = roc_auc_score(y_test, proba)\n",
"\n",
"# Coefficients labelled with the transformed feature names, largest magnitude first.\n",
"features = model.named_steps[\"pre\"].get_feature_names_out()\n",
"coef = model.named_steps[\"clf\"].coef_[0]\n",
"coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n",
"auc, coef_series.head(10)\n"
]
},
|
||
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Вывод по гипотезе\n",
"- Доля клиентов с заказом растёт примерно линейно по квантильным бинам доли ent-показов.\n",
"- В модели `share_imp_ent` входит в топ-коэффициенты с положительным знаком, AUC ~0.61: эффект слабее, чем у спама, но заметный (формальная проверка статистической значимости в ноутбуке не проводилась).\n",
"- Гипотеза подтверждается на уровне корреляции: высокая доля развлечений (ent) в показах связана с заказами при контроле общего объёма показов; причинный вывод из этого не следует."
]
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"name": "python",
|
||
"version": "3.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|