{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Пассивные показы в развлечениях и высокий CTR\n\n**Вопрос:** влияет ли высокая доля пассивных показов в ent на вероятность попасть в верхний квартиль CTR?\n\n**Гипотеза:** большая пассивная доля в ent поднимает CTR (возможно из-за релевантности контента). Проверяем через ML-классификацию `high_ctr`." ] }, { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2025-12-12T19:27:39.950563Z", "start_time": "2025-12-12T19:27:39.023085Z" } }, "source": [ "import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n" ], "outputs": [], "execution_count": 3 }, { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2025-12-12T19:27:40.126928Z", "start_time": "2025-12-12T19:27:39.955172Z" } }, "source": [ "for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\nclient = df.groupby(\"id\").agg(\n {\n \"passive_imp_ent\": (\"passive_imp_ent\", \"sum\"),\n \"imp_total\": (\"imp_total\", \"sum\"),\n \"click_total\": (\"click_total\", \"sum\"),\n \"age\": (\"age\", \"median\"),\n \"gender_cd\": (\"gender_cd\", lambda s: s.mode().iat[0]),\n \"device_platform_cd\": (\"device_platform_cd\", lambda s: s.mode().iat[0]),\n }\n).reset_index()\n\nclient[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\nclient[\"passive_ent_share\"] = eda.safe_divide(client[\"passive_imp_ent\"], client[\"imp_total\"])\nclient[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\nclient.head()\n" ], "outputs": [ { "ename": "AttributeError", "evalue": "'SeriesGroupBy' object has no attribute 'passive_imp_ent'", "output_type": "error", "traceback": [ "\u001B[31m---------------------------------------------------------------------------\u001B[39m", "\u001B[31mAttributeError\u001B[39m Traceback (most recent call last)", "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[4]\u001B[39m\u001B[32m, line 12\u001B[39m\n\u001B[32m 9\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_imp_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m 10\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_click_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_click_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m---> \u001B[39m\u001B[32m12\u001B[39m client = \u001B[43mdf\u001B[49m\u001B[43m.\u001B[49m\u001B[43mgroupby\u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mid\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m \u001B[49m\u001B[43m{\u001B[49m\n\u001B[32m 14\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 17\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mmedian\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 18\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 19\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 20\u001B[39m \u001B[43m \u001B[49m\u001B[43m}\u001B[49m\n\u001B[32m 21\u001B[39m \u001B[43m)\u001B[49m.reset_index()\n\u001B[32m 23\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mctr_all\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m], client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n\u001B[32m 24\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mpassive_ent_share\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_ent\u001B[39m\u001B[33m\"\u001B[39m], client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001B[39m, in \u001B[36mDataFrameGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 1429\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m 1431\u001B[39m op = GroupByApply(\u001B[38;5;28mself\u001B[39m, func, args=args, kwargs=kwargs)\n\u001B[32m-> \u001B[39m\u001B[32m1432\u001B[39m result = \u001B[43mop\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1433\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_dict_like(func) \u001B[38;5;129;01mand\u001B[39;00m result \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 1434\u001B[39m \u001B[38;5;66;03m# GH #52849\u001B[39;00m\n\u001B[32m 1435\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m.as_index \u001B[38;5;129;01mand\u001B[39;00m is_list_like(func):\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:190\u001B[39m, in \u001B[36mApply.agg\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 187\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.apply_str()\n\u001B[32m 189\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m is_dict_like(func):\n\u001B[32m--> \u001B[39m\u001B[32m190\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 191\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m is_list_like(func):\n\u001B[32m 192\u001B[39m \u001B[38;5;66;03m# we require a list, but not a 'str'\u001B[39;00m\n\u001B[32m 193\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.agg_list_like()\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:423\u001B[39m, in \u001B[36mApply.agg_dict_like\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 415\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34magg_dict_like\u001B[39m(\u001B[38;5;28mself\u001B[39m) -> DataFrame | Series:\n\u001B[32m 416\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 417\u001B[39m \u001B[33;03m Compute aggregation in the case of a dict-like argument.\u001B[39;00m\n\u001B[32m 418\u001B[39m \n\u001B[32m (...)\u001B[39m\u001B[32m 421\u001B[39m \u001B[33;03m Result of aggregation.\u001B[39;00m\n\u001B[32m 422\u001B[39m \u001B[33;03m \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m423\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_or_apply_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43magg\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001B[39m, in \u001B[36mGroupByApply.agg_or_apply_dict_like\u001B[39m\u001B[34m(self, op_name)\u001B[39m\n\u001B[32m 1598\u001B[39m kwargs.update({\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m: engine, \u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m: engine_kwargs})\n\u001B[32m 1600\u001B[39m \u001B[38;5;28;01mwith\u001B[39;00m com.temp_setattr(\n\u001B[32m 1601\u001B[39m obj, \u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m, \u001B[38;5;28;01mTrue\u001B[39;00m, condition=\u001B[38;5;28mhasattr\u001B[39m(obj, \u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 1602\u001B[39m ):\n\u001B[32m-> \u001B[39m\u001B[32m1603\u001B[39m result_index, result_data = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mcompute_dict_like\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 1604\u001B[39m \u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselected_obj\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselection\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwargs\u001B[49m\n\u001B[32m 1605\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1606\u001B[39m result = \u001B[38;5;28mself\u001B[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001B[32m 1607\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:497\u001B[39m, in \u001B[36mApply.compute_dict_like\u001B[39m\u001B[34m(self, op_name, selected_obj, selection, kwargs)\u001B[39m\n\u001B[32m 493\u001B[39m results += key_data\n\u001B[32m 494\u001B[39m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[32m 495\u001B[39m \u001B[38;5;66;03m# key used for column selection and output\u001B[39;00m\n\u001B[32m 496\u001B[39m results = [\n\u001B[32m--> \u001B[39m\u001B[32m497\u001B[39m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mobj\u001B[49m\u001B[43m.\u001B[49m\u001B[43m_gotitem\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mndim\u001B[49m\u001B[43m=\u001B[49m\u001B[32;43m1\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m(\u001B[49m\u001B[43mhow\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 498\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m key, how \u001B[38;5;129;01min\u001B[39;00m func.items()\n\u001B[32m 499\u001B[39m ]\n\u001B[32m 500\u001B[39m keys = \u001B[38;5;28mlist\u001B[39m(func.keys())\n\u001B[32m 502\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m keys, results\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:257\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 255\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m] = engine\n\u001B[32m 256\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m257\u001B[39m ret = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_aggregate_multiple_funcs\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 258\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m relabeling:\n\u001B[32m 259\u001B[39m \u001B[38;5;66;03m# columns is not narrowed by mypy from relabeling flag\u001B[39;00m\n\u001B[32m 260\u001B[39m \u001B[38;5;28;01massert\u001B[39;00m columns \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;66;03m# for mypy\u001B[39;00m\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:362\u001B[39m, in \u001B[36mSeriesGroupBy._aggregate_multiple_funcs\u001B[39m\u001B[34m(self, arg, *args, **kwargs)\u001B[39m\n\u001B[32m 360\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m idx, (name, func) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(arg):\n\u001B[32m 361\u001B[39m key = base.OutputKey(label=name, position=idx)\n\u001B[32m--> \u001B[39m\u001B[32m362\u001B[39m results[key] = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43maggregate\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 364\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, DataFrame) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m results.values()):\n\u001B[32m 365\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mpandas\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m concat\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:249\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 247\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m engine_kwargs \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 248\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m249\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m)\u001B[49m(*args, **kwargs)\n\u001B[32m 251\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(func, abc.Iterable):\n\u001B[32m 252\u001B[39m \u001B[38;5;66;03m# Catch instances of lists / tuples\u001B[39;00m\n\u001B[32m 253\u001B[39m \u001B[38;5;66;03m# but not the class list / tuple itself.\u001B[39;00m\n\u001B[32m 254\u001B[39m func = maybe_mangle_lambdas(func)\n", "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/groupby.py:1365\u001B[39m, in \u001B[36mGroupBy.__getattr__\u001B[39m\u001B[34m(self, attr)\u001B[39m\n\u001B[32m 1362\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m attr \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.obj:\n\u001B[32m 1363\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m[attr]\n\u001B[32m-> \u001B[39m\u001B[32m1365\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mAttributeError\u001B[39;00m(\n\u001B[32m 1366\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(\u001B[38;5;28mself\u001B[39m).\u001B[34m__name__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m object has no attribute \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mattr\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 1367\u001B[39m )\n", "\u001B[31mAttributeError\u001B[39m: 'SeriesGroupBy' object has no attribute 'passive_imp_ent'" ] } ], "execution_count": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Визуализация: доля пассивных ent vs CTR" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.qcut(client[\"passive_ent_share\"], 8, duplicates=\"drop\")\nmed = client.groupby(bins)[\"ctr_all\"].median().reset_index()\nmed[\"passive_ent_share\"] = med[\"passive_ent_share\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=med, x=\"passive_ent_share\", y=\"ctr_all\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"CTR vs доля пассивных ent показов\")\nplt.tight_layout()\nplt.show()\nmed\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ML-модель на high CTR" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X = client[[\"passive_ent_share\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"high_ctr\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"passive_ent_share\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Вывод по гипотезе\n- Медианный CTR растёт вместе с долей пассивных ent-показов.\n- В модели `passive_ent_share` — топ-фича с положительным знаком, AUC ~0.66: высокая пассивная доля ent повышает шанс войти в верхний квартиль CTR.\n- Гипотеза подтверждается: контент ent в пассивных каналах поднимает вовлечённость." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.13" } }, "nbformat": 4, "nbformat_minor": 2 }