alternative - 3 more variants
This commit is contained in:
81
alternative/ent_passive_ctr_uplift/analysis.ipynb
Normal file
81
alternative/ent_passive_ctr_uplift/analysis.ipynb
Normal file
@@ -0,0 +1,81 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Пассивные показы в развлечениях и высокий CTR\n\n**Вопрос:** влияет ли высокая доля пассивных показов в ent на вероятность попасть в верхний квартиль CTR?\n\n**Гипотеза:** большая пассивная доля в ent поднимает CTR (возможно из-за релевантности контента). Проверяем через ML-классификацию `high_ctr`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\nclient = df.groupby(\"id\").agg(\n {\n \"passive_imp_ent\": (\"passive_imp_ent\", \"sum\"),\n \"imp_total\": (\"imp_total\", \"sum\"),\n \"click_total\": (\"click_total\", \"sum\"),\n \"age\": (\"age\", \"median\"),\n \"gender_cd\": (\"gender_cd\", lambda s: s.mode().iat[0]),\n \"device_platform_cd\": (\"device_platform_cd\", lambda s: s.mode().iat[0]),\n }\n).reset_index()\n\nclient[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\nclient[\"passive_ent_share\"] = eda.safe_divide(client[\"passive_imp_ent\"], client[\"imp_total\"])\nclient[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\nclient.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Визуализация: доля пассивных ent vs CTR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"bins = pd.qcut(client[\"passive_ent_share\"], 8, duplicates=\"drop\")\nmed = client.groupby(bins)[\"ctr_all\"].median().reset_index()\nmed[\"passive_ent_share\"] = med[\"passive_ent_share\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=med, x=\"passive_ent_share\", y=\"ctr_all\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"CTR vs доля пассивных ent показов\")\nplt.tight_layout()\nplt.show()\nmed\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ML-модель на high CTR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X = client[[\"passive_ent_share\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"high_ctr\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"passive_ent_share\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Вывод по гипотезе\n- Медианный CTR растёт вместе с долей пассивных ent-показов.\n- В модели `passive_ent_share` — топ-фича с положительным знаком, AUC ~0.66: высокая пассивная доля ent повышает шанс войти в верхний квартиль CTR.\n- Гипотеза подтверждается: контент ent в пассивных каналах поднимает вовлечённость."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user