Files
dano2025/preanalysis/05_exploratory_models.ipynb
2025-12-12 20:19:59 +03:00

450 lines
79 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "ef83309f",
"metadata": {},
"source": [
"# 05. Эксплораторные модели и гипотезы\n",
"\n",
"Цели: построить простые модели прогнозирования наличия заказа, оценить важность признаков, собрать таблицу статистических гипотез."
]
},
{
"cell_type": "code",
"id": "55cfab4e",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:44.710487Z",
"start_time": "2025-12-05T18:35:33.975533Z"
}
},
"source": [
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from eda_utils import (\n",
" load_data, DATA_PATH, CATEGORIES, ACTIVE_IMP_COLS, PASSIVE_IMP_COLS,\n",
" ACTIVE_CLICK_COLS, PASSIVE_CLICK_COLS, ORDER_COLS, NUMERIC_COLS, CAT_COLS,\n",
" describe_zero_share, safe_divide, build_daily, build_client, add_contact_density\n",
")\n",
"# Display options: show every column and render floats with thousands separators and 3 decimals.\n",
"pd.options.display.max_columns = None\n",
"pd.set_option(\"display.float_format\", '{:,.3f}'.format)\n",
"# Seaborn theme applied to every figure drawn in this notebook.\n",
"sns.set_theme(palette=\"deep\", style=\"ticks\")\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, roc_auc_score\n",
"from sklearn.impute import SimpleImputer\n",
"from scipy import stats\n",
"\n",
"df = load_data()\n",
"client = build_client(df)\n"
],
"outputs": [],
"execution_count": 1
},
{
"cell_type": "code",
"id": "d4f620fe",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:44.721273Z",
"start_time": "2025-12-05T18:35:44.714476Z"
}
},
"source": [
"# Model inputs: the order flag as target, plus numeric and categorical predictors.\n",
"target = client['has_any_order']\n",
"\n",
"# Numeric block = impression/click column groups from eda_utils followed by age, contact and CTR columns.\n",
"derived_numeric = ['age', 'contact_days', 'avg_impressions_per_contact_day', 'active_ctr', 'passive_ctr', 'ctr_all']\n",
"num_features = [*ACTIVE_IMP_COLS, *PASSIVE_IMP_COLS, *ACTIVE_CLICK_COLS, *PASSIVE_CLICK_COLS, *derived_numeric]\n",
"cat_features = ['gender_cd', 'device_platform_cd', 'age_group']\n",
"X = client.loc[:, num_features + cat_features]\n",
"\n",
"# Numeric columns: median imputation, then standardisation.\n",
"numeric_steps = Pipeline(steps=[\n",
"    ('imputer', SimpleImputer(strategy='median')),\n",
"    ('scaler', StandardScaler()),\n",
"])\n",
"# Categorical columns: one-hot encoding; categories not seen during fit are ignored instead of raising.\n",
"categorical_steps = OneHotEncoder(handle_unknown='ignore')\n",
"preprocess = ColumnTransformer(transformers=[\n",
"    ('num', numeric_steps, num_features),\n",
"    ('cat', categorical_steps, cat_features),\n",
"])\n",
"\n",
"# Stratified 80/20 hold-out split with a fixed seed so the evaluation is reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, target, test_size=0.2, stratify=target, random_state=42\n",
")\n"
],
"outputs": [],
"execution_count": 2
},
{
"cell_type": "code",
"id": "31ad015b",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:46.234607Z",
"start_time": "2025-12-05T18:35:44.727554Z"
}
},
"source": [
"# Baseline classifier: shared preprocessing followed by logistic regression.\n",
"log_reg_steps = [\n",
"    ('preprocess', preprocess),\n",
"    ('model', LogisticRegression(max_iter=500, n_jobs=-1)),\n",
"]\n",
"# Pipeline.fit returns the fitted pipeline itself, so construction and fitting can be chained.\n",
"log_reg = Pipeline(steps=log_reg_steps).fit(X_train, y_train)\n",
"\n",
"# Hold-out evaluation: hard labels feed the per-class report,\n",
"# probabilities of the second class (label 1) feed ROC-AUC.\n",
"preds = log_reg.predict(X_test)\n",
"proba = log_reg.predict_proba(X_test)[:, 1]\n",
"print(classification_report(y_true=y_test, y_pred=preds))\n",
"print('ROC-AUC', roc_auc_score(y_true=y_test, y_score=proba))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.63 0.60 0.61 726\n",
" 1 0.70 0.73 0.71 942\n",
"\n",
" accuracy 0.67 1668\n",
" macro avg 0.66 0.66 0.66 1668\n",
"weighted avg 0.67 0.67 0.67 1668\n",
"\n",
"ROC-AUC 0.7244009288016237\n"
]
}
],
"execution_count": 3
},
{
"cell_type": "code",
"id": "6815f1d0",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:46.309611Z",
"start_time": "2025-12-05T18:35:46.302180Z"
}
},
"source": [
"# Pull the fitted pieces out of the logistic-regression pipeline.\n",
"fitted_preprocess = log_reg.named_steps['preprocess']\n",
"model = log_reg.named_steps['model']\n",
"cat_encoder = fitted_preprocess.named_transformers_['cat']\n",
"\n",
"# Column order of the design matrix: numeric block first, then the one-hot columns.\n",
"num_names = list(fitted_preprocess.transformers_[0][2])\n",
"cat_names = list(cat_encoder.get_feature_names_out(cat_features))\n",
"feature_names = num_names + cat_names\n",
"\n",
"# One row per design-matrix column; exp(coef) is the odds ratio.\n",
"# Numeric inputs were standardised, so their coefficients are per one standard deviation.\n",
"coef = pd.DataFrame({'feature': feature_names, 'coef': model.coef_.ravel()})\n",
"coef = coef.assign(odds_ratio=np.exp(coef['coef']))\n",
"coef.sort_values(by='odds_ratio', ascending=False).head(20)\n"
],
"outputs": [
{
"data": {
"text/plain": [
" feature coef odds_ratio\n",
"20 passive_click_transport 0.433 1.542\n",
"18 passive_click_ent 0.426 1.531\n",
"0 active_imp_ent 0.424 1.528\n",
"27 active_ctr 0.399 1.491\n",
"38 age_group_55+ 0.355 1.426\n",
"14 active_click_transport 0.336 1.399\n",
"2 active_imp_transport 0.322 1.380\n",
"22 passive_click_hotel 0.239 1.270\n",
"6 passive_imp_ent 0.230 1.258\n",
"3 active_imp_shopping 0.224 1.251\n",
"32 device_platform_cd_Android 0.213 1.237\n",
"23 passive_click_avia 0.203 1.225\n",
"4 active_imp_hotel 0.189 1.208\n",
"36 age_group_35-44 0.176 1.192\n",
"1 active_imp_super 0.172 1.188\n",
"19 passive_click_super 0.144 1.155\n",
"5 active_imp_avia 0.120 1.128\n",
"28 passive_ctr 0.116 1.123\n",
"33 device_platform_cd_iOS 0.084 1.087\n",
"9 passive_imp_shopping 0.079 1.082"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>coef</th>\n",
" <th>odds_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>passive_click_transport</td>\n",
" <td>0.433</td>\n",
" <td>1.542</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>passive_click_ent</td>\n",
" <td>0.426</td>\n",
" <td>1.531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>active_imp_ent</td>\n",
" <td>0.424</td>\n",
" <td>1.528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>active_ctr</td>\n",
" <td>0.399</td>\n",
" <td>1.491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>age_group_55+</td>\n",
" <td>0.355</td>\n",
" <td>1.426</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>active_click_transport</td>\n",
" <td>0.336</td>\n",
" <td>1.399</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>active_imp_transport</td>\n",
" <td>0.322</td>\n",
" <td>1.380</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>passive_click_hotel</td>\n",
" <td>0.239</td>\n",
" <td>1.270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>passive_imp_ent</td>\n",
" <td>0.230</td>\n",
" <td>1.258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>active_imp_shopping</td>\n",
" <td>0.224</td>\n",
" <td>1.251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>device_platform_cd_Android</td>\n",
" <td>0.213</td>\n",
" <td>1.237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>passive_click_avia</td>\n",
" <td>0.203</td>\n",
" <td>1.225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>active_imp_hotel</td>\n",
" <td>0.189</td>\n",
" <td>1.208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>age_group_35-44</td>\n",
" <td>0.176</td>\n",
" <td>1.192</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>active_imp_super</td>\n",
" <td>0.172</td>\n",
" <td>1.188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>passive_click_super</td>\n",
" <td>0.144</td>\n",
" <td>1.155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>active_imp_avia</td>\n",
" <td>0.120</td>\n",
" <td>1.128</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>passive_ctr</td>\n",
" <td>0.116</td>\n",
" <td>1.123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>device_platform_cd_iOS</td>\n",
" <td>0.084</td>\n",
" <td>1.087</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>passive_imp_shopping</td>\n",
" <td>0.079</td>\n",
" <td>1.082</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"cell_type": "code",
"id": "1da69077",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:46.851634Z",
"start_time": "2025-12-05T18:35:46.333923Z"
}
},
"source": [
"# Random forest on the same preprocessing; used here only for its feature importances.\n",
"# NOTE(review): `preprocess` is the same object used by log_reg and is refit in place here;\n",
"# both fits use X_train, so the earlier pipeline should be unaffected - confirm if the data ever differ.\n",
"rf = Pipeline([\n",
"    ('preprocess', preprocess),\n",
"    ('model', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),\n",
"])\n",
"rf.fit(X_train, y_train)\n",
"rf_model = rf.named_steps['model']\n",
"\n",
"# Take the column labels from this pipeline's own fitted preprocessor instead of reusing\n",
"# a list built in another cell, so the labels cannot drift from the columns the forest saw.\n",
"rf_preprocess = rf.named_steps['preprocess']\n",
"rf_features = list(rf_preprocess.transformers_[0][2]) + list(\n",
"    rf_preprocess.named_transformers_['cat'].get_feature_names_out(cat_features)\n",
")\n",
"assert len(rf_features) == len(rf_model.feature_importances_), 'feature names do not match the design matrix'\n",
"\n",
"importances = (\n",
"    pd.DataFrame({'feature': rf_features, 'importance': rf_model.feature_importances_})\n",
"    .sort_values('importance', ascending=False)\n",
")\n",
"\n",
"# Explicit figure/axes interface: the plot does not depend on pyplot's implicit current figure.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.barplot(data=importances.head(20), x='importance', y='feature', ax=ax)\n",
"ax.set_title('Feature importance (RandomForest)')\n",
"fig.tight_layout()\n"
],
"outputs": [
{
"data": {
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
],
"image/png": ""
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
}
],
"execution_count": 5
},
{
"cell_type": "code",
"id": "4481d690",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-05T18:35:46.893437Z",
"start_time": "2025-12-05T18:35:46.875542Z"
}
},
"source": [
"# Table of statistical hypotheses: one record per test.\n",
"hypotheses = []\n",
"\n",
"# H1: active CTR exceeds passive CTR.\n",
"# Both CTRs are columns of the same frame, i.e. paired measurements on the same rows, so the\n",
"# paired Wilcoxon signed-rank test is used (Mann-Whitney U assumes two independent samples).\n",
"# The client-level frame is used so both hypotheses share the same frame as the models above.\n",
"# Only rows where both CTRs are defined are kept, which preserves the pairing.\n",
"ctr_pairs = client[['active_ctr', 'passive_ctr']].dropna()\n",
"stat, p = stats.wilcoxon(ctr_pairs['active_ctr'], ctr_pairs['passive_ctr'], alternative='greater')\n",
"hypotheses.append({'hypothesis': 'CTR_active > CTR_passive', 'test': 'Wilcoxon signed-rank', 'pvalue': p})\n",
"\n",
"# H2: overall CTR differs by gender. The two gender groups are disjoint sets of rows,\n",
"# so the independent-samples Mann-Whitney U test applies.\n",
"m_ctr = client.loc[client['gender_cd'] == 'M', 'ctr_all'].dropna()\n",
"f_ctr = client.loc[client['gender_cd'] == 'F', 'ctr_all'].dropna()\n",
"stat, p = stats.mannwhitneyu(m_ctr, f_ctr, alternative='two-sided')\n",
"hypotheses.append({'hypothesis': 'CTR отличается по полу', 'test': 'Mann-Whitney', 'pvalue': p})\n",
"\n",
"hypo_df = pd.DataFrame(hypotheses)\n",
"hypo_df\n"
],
"outputs": [
{
"data": {
"text/plain": [
" hypothesis test pvalue\n",
"0 CTR_active > CTR_passive Mann-Whitney 0.000\n",
"1 CTR отличается по полу Mann-Whitney 0.277"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hypothesis</th>\n",
" <th>test</th>\n",
" <th>pvalue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CTR_active &gt; CTR_passive</td>\n",
" <td>Mann-Whitney</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CTR отличается по полу</td>\n",
" <td>Mann-Whitney</td>\n",
" <td>0.277</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 6
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 5
}