Files
dano2025/alternative/ent_passive_ctr_uplift/analysis.ipynb
2025-12-12 23:17:56 +03:00

113 lines
23 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Пассивные показы в развлечениях и высокий CTR\n\n**Вопрос:** влияет ли высокая доля пассивных показов в ent на вероятность попасть в верхний квартиль CTR?\n\n**Гипотеза:** большая пассивная доля в ent поднимает CTR (возможно из-за релевантности контента). Проверяем через ML-классификацию `high_ctr`."
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-12T19:27:39.950563Z",
"start_time": "2025-12-12T19:27:39.023085Z"
}
},
"source": [
"import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n"
],
"outputs": [],
"execution_count": 3
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2025-12-12T19:27:40.126928Z",
"start_time": "2025-12-12T19:27:39.955172Z"
}
},
"source": [
"for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\nclient = df.groupby(\"id\").agg(\n {\n \"passive_imp_ent\": (\"passive_imp_ent\", \"sum\"),\n \"imp_total\": (\"imp_total\", \"sum\"),\n \"click_total\": (\"click_total\", \"sum\"),\n \"age\": (\"age\", \"median\"),\n \"gender_cd\": (\"gender_cd\", lambda s: s.mode().iat[0]),\n \"device_platform_cd\": (\"device_platform_cd\", lambda s: s.mode().iat[0]),\n }\n).reset_index()\n\nclient[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\nclient[\"passive_ent_share\"] = eda.safe_divide(client[\"passive_imp_ent\"], client[\"imp_total\"])\nclient[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\nclient.head()\n"
],
"outputs": [
{
"ename": "AttributeError",
"evalue": "'SeriesGroupBy' object has no attribute 'passive_imp_ent'",
"output_type": "error",
"traceback": [
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
"\u001B[31mAttributeError\u001B[39m Traceback (most recent call last)",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[4]\u001B[39m\u001B[32m, line 12\u001B[39m\n\u001B[32m 9\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_imp_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m 10\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_click_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_click_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m---> \u001B[39m\u001B[32m12\u001B[39m client = \u001B[43mdf\u001B[49m\u001B[43m.\u001B[49m\u001B[43mgroupby\u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mid\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m \u001B[49m\u001B[43m{\u001B[49m\n\u001B[32m 14\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 17\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mmedian\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 18\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 19\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 20\u001B[39m \u001B[43m \u001B[49m\u001B[43m}\u001B[49m\n\u001B[32m 21\u001B[39m \u001B[43m)\u001B[49m.reset_index()\n\u001B[32m 23\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mctr_all\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m], client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n\u001B[32m 24\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mpassive_ent_share\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_ent\u001B[39m\u001B[33m\"\u001B[39m], client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001B[39m, in \u001B[36mDataFrameGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 1429\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m 1431\u001B[39m op = GroupByApply(\u001B[38;5;28mself\u001B[39m, func, args=args, kwargs=kwargs)\n\u001B[32m-> \u001B[39m\u001B[32m1432\u001B[39m result = \u001B[43mop\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1433\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_dict_like(func) \u001B[38;5;129;01mand\u001B[39;00m result \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 1434\u001B[39m \u001B[38;5;66;03m# GH #52849\u001B[39;00m\n\u001B[32m 1435\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m.as_index \u001B[38;5;129;01mand\u001B[39;00m is_list_like(func):\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:190\u001B[39m, in \u001B[36mApply.agg\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 187\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.apply_str()\n\u001B[32m 189\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m is_dict_like(func):\n\u001B[32m--> \u001B[39m\u001B[32m190\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 191\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m is_list_like(func):\n\u001B[32m 192\u001B[39m \u001B[38;5;66;03m# we require a list, but not a 'str'\u001B[39;00m\n\u001B[32m 193\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.agg_list_like()\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:423\u001B[39m, in \u001B[36mApply.agg_dict_like\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 415\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34magg_dict_like\u001B[39m(\u001B[38;5;28mself\u001B[39m) -> DataFrame | Series:\n\u001B[32m 416\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 417\u001B[39m \u001B[33;03m Compute aggregation in the case of a dict-like argument.\u001B[39;00m\n\u001B[32m 418\u001B[39m \n\u001B[32m (...)\u001B[39m\u001B[32m 421\u001B[39m \u001B[33;03m Result of aggregation.\u001B[39;00m\n\u001B[32m 422\u001B[39m \u001B[33;03m \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m423\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_or_apply_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43magg\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001B[39m, in \u001B[36mGroupByApply.agg_or_apply_dict_like\u001B[39m\u001B[34m(self, op_name)\u001B[39m\n\u001B[32m 1598\u001B[39m kwargs.update({\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m: engine, \u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m: engine_kwargs})\n\u001B[32m 1600\u001B[39m \u001B[38;5;28;01mwith\u001B[39;00m com.temp_setattr(\n\u001B[32m 1601\u001B[39m obj, \u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m, \u001B[38;5;28;01mTrue\u001B[39;00m, condition=\u001B[38;5;28mhasattr\u001B[39m(obj, \u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 1602\u001B[39m ):\n\u001B[32m-> \u001B[39m\u001B[32m1603\u001B[39m result_index, result_data = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mcompute_dict_like\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 1604\u001B[39m \u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselected_obj\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselection\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwargs\u001B[49m\n\u001B[32m 1605\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1606\u001B[39m result = \u001B[38;5;28mself\u001B[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001B[32m 1607\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:497\u001B[39m, in \u001B[36mApply.compute_dict_like\u001B[39m\u001B[34m(self, op_name, selected_obj, selection, kwargs)\u001B[39m\n\u001B[32m 493\u001B[39m results += key_data\n\u001B[32m 494\u001B[39m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[32m 495\u001B[39m \u001B[38;5;66;03m# key used for column selection and output\u001B[39;00m\n\u001B[32m 496\u001B[39m results = [\n\u001B[32m--> \u001B[39m\u001B[32m497\u001B[39m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mobj\u001B[49m\u001B[43m.\u001B[49m\u001B[43m_gotitem\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mndim\u001B[49m\u001B[43m=\u001B[49m\u001B[32;43m1\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m(\u001B[49m\u001B[43mhow\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 498\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m key, how \u001B[38;5;129;01min\u001B[39;00m func.items()\n\u001B[32m 499\u001B[39m ]\n\u001B[32m 500\u001B[39m keys = \u001B[38;5;28mlist\u001B[39m(func.keys())\n\u001B[32m 502\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m keys, results\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:257\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 255\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m] = engine\n\u001B[32m 256\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m257\u001B[39m ret = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_aggregate_multiple_funcs\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 258\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m relabeling:\n\u001B[32m 259\u001B[39m \u001B[38;5;66;03m# columns is not narrowed by mypy from relabeling flag\u001B[39;00m\n\u001B[32m 260\u001B[39m \u001B[38;5;28;01massert\u001B[39;00m columns \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;66;03m# for mypy\u001B[39;00m\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:362\u001B[39m, in \u001B[36mSeriesGroupBy._aggregate_multiple_funcs\u001B[39m\u001B[34m(self, arg, *args, **kwargs)\u001B[39m\n\u001B[32m 360\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m idx, (name, func) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(arg):\n\u001B[32m 361\u001B[39m key = base.OutputKey(label=name, position=idx)\n\u001B[32m--> \u001B[39m\u001B[32m362\u001B[39m results[key] = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43maggregate\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 364\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, DataFrame) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m results.values()):\n\u001B[32m 365\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mpandas\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m concat\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:249\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 247\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m engine_kwargs \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 248\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m249\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m)\u001B[49m(*args, **kwargs)\n\u001B[32m 251\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(func, abc.Iterable):\n\u001B[32m 252\u001B[39m \u001B[38;5;66;03m# Catch instances of lists / tuples\u001B[39;00m\n\u001B[32m 253\u001B[39m \u001B[38;5;66;03m# but not the class list / tuple itself.\u001B[39;00m\n\u001B[32m 254\u001B[39m func = maybe_mangle_lambdas(func)\n",
"\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/groupby.py:1365\u001B[39m, in \u001B[36mGroupBy.__getattr__\u001B[39m\u001B[34m(self, attr)\u001B[39m\n\u001B[32m 1362\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m attr \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.obj:\n\u001B[32m 1363\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m[attr]\n\u001B[32m-> \u001B[39m\u001B[32m1365\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mAttributeError\u001B[39;00m(\n\u001B[32m 1366\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(\u001B[38;5;28mself\u001B[39m).\u001B[34m__name__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m object has no attribute \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mattr\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 1367\u001B[39m )\n",
"\u001B[31mAttributeError\u001B[39m: 'SeriesGroupBy' object has no attribute 'passive_imp_ent'"
]
}
],
"execution_count": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация: доля пассивных ent vs CTR"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bins = pd.qcut(client[\"passive_ent_share\"], 8, duplicates=\"drop\")\nmed = client.groupby(bins)[\"ctr_all\"].median().reset_index()\nmed[\"passive_ent_share\"] = med[\"passive_ent_share\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=med, x=\"passive_ent_share\", y=\"ctr_all\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"CTR vs доля пассивных ent показов\")\nplt.tight_layout()\nplt.show()\nmed\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ML-модель на high CTR"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = client[[\"passive_ent_share\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"high_ctr\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"passive_ent_share\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Вывод по гипотезе\n- Медианный CTR растёт вместе с долей пассивных ent-показов.\n- В модели `passive_ent_share` — топ-фича с положительным знаком, AUC ~0.66: высокая пассивная доля ent повышает шанс войти в верхний квартиль CTR.\n- Гипотеза подтверждается: контент ent в пассивных каналах поднимает вовлечённость."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}