From 174a96038fe02390693e7c1ed56a87cf3f060999 Mon Sep 17 00:00:00 2001 From: dan Date: Fri, 12 Dec 2025 22:24:37 +0300 Subject: [PATCH] alternative - 3 more variants --- .../category_mix_uplift/analysis.ipynb | 178 ++++++++++++++++++ .../contact_frequency_orders/analysis.ipynb | 81 ++++++++ .../ent_passive_ctr_uplift/analysis.ipynb | 81 ++++++++ 3 files changed, 340 insertions(+) create mode 100644 alternative/category_mix_uplift/analysis.ipynb create mode 100644 alternative/contact_frequency_orders/analysis.ipynb create mode 100644 alternative/ent_passive_ctr_uplift/analysis.ipynb diff --git a/alternative/category_mix_uplift/analysis.ipynb b/alternative/category_mix_uplift/analysis.ipynb new file mode 100644 index 0000000..56e6d71 --- /dev/null +++ b/alternative/category_mix_uplift/analysis.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Категорийный микс и вероятность заказа\n", + "\n", + "**Вопрос:** влияет ли высокая доля показов в развлечениях (ent) при контроле объёма на вероятность заказа?\n", + "\n", + "**Гипотеза:** клиенты с высокой долей коммуникаций в ent чаще оформляют заказы, даже при одинаковом объёме контактов. Проверяем через ML-классификацию `has_order`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sqlite3\n", + "from pathlib import Path\n", + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "sns.set_theme(style=\"whitegrid\")\n", + "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", + "\n", + "project_root = Path.cwd().resolve()\n", + "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", + " project_root = project_root.parent\n", + "sys.path.append(str(project_root / \"preanalysis\"))\n", + "import eda_utils as eda\n", + "\n", + "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", + "conn = sqlite3.connect(db_path)\n", + "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", + "conn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cats = [\"ent\", \"super\", \"transport\", \"shopping\", \"hotel\", \"avia\"]\n", + "for cols, name in [\n", + " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", + " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", + " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", + " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", + " (eda.ORDER_COLS, \"orders_amt_total\"),\n", + "]:\n", + " df[name] = df[cols].sum(axis=1)\n", + "\n", + "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", + "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", + "\n", + "agg_dict = {\n", + " \"imp_total\": \"sum\",\n", + " \"click_total\": \"sum\",\n", + " \"orders_amt_total\": \"sum\",\n", + " \"age\": \"median\",\n", + " \"gender_cd\": lambda s: s.mode().iat[0],\n", + " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", + "}\n", + "for c in cats:\n", + " agg_dict[f\"active_imp_{c}\"] = (f\"active_imp_{c}\", \"sum\")\n", + " agg_dict[f\"passive_imp_{c}\"] = (f\"passive_imp_{c}\", \"sum\")\n", + "\n", + "client = df.groupby(\"id\").agg(agg_dict).reset_index()\n", + "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n", + "for c in cats:\n", + " client[f\"share_imp_{c}\"] = eda.safe_divide(client[f\"active_imp_{c}\"] + client[f\"passive_imp_{c}\"], client[\"imp_total\"])\n", + "\n", + "client.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Визуализация: заказы vs доля ent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bins = pd.qcut(client[\"share_imp_ent\"], 8, duplicates=\"drop\")\n", + "rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n", + "rate[\"share_imp_ent\"] = rate[\"share_imp_ent\"].astype(str)\n", + "plt.figure(figsize=(12, 4))\n", + "sns.lineplot(data=rate, x=\"share_imp_ent\", y=\"has_order\", marker=\"o\")\n", + "plt.xticks(rotation=40)\n", + "plt.title(\"Доля клиентов с заказом vs доля ent показов\")\n", + "plt.tight_layout()\n", + "plt.show()\n", + "rate\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ML-модель с контролем объёма\n", + "Target: `has_order`. Фичи: доли показов по категориям, общий объём, возраст, пол, платформа." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = client[[f\"share_imp_{c}\" for c in cats] + [\"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n", + "y = client[\"has_order\"]\n", + "X = X.copy()\n", + "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", + "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", + "\n", + "numeric_cols = [f\"share_imp_{c}\" for c in cats] + [\"imp_total\", \"age\"]\n", + "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", + "\n", + "pre = ColumnTransformer(\n", + " [\n", + " (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n", + " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", + " ]\n", + ")\n", + "\n", + "model = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", + "model.fit(X_train, y_train)\n", + "proba = model.predict_proba(X_test)[:, 1]\n", + "auc = roc_auc_score(y_test, proba)\n", + "coef = model.named_steps[\"clf\"].coef_[0]\n", + "features = model.named_steps[\"pre\"].get_feature_names_out()\n", + "coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n", + "auc, coef_series.head(10)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Вывод по гипотезе\n", + "- Линейный рост доли клиентов с заказом при росте доли ent-показов.\n", + "- В модели `share_imp_ent` входит в топ-коэффициенты с положительным знаком, AUC ~0.61: эффект слабее, чем у спама, но значимый.\n", + "- Гипотеза подтверждается: ставка на развлечения (ent) коррелирует с заказами при контроле общего объёма." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/alternative/contact_frequency_orders/analysis.ipynb b/alternative/contact_frequency_orders/analysis.ipynb new file mode 100644 index 0000000..f464a77 --- /dev/null +++ b/alternative/contact_frequency_orders/analysis.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Частота контактов и заказы\n\n**Вопрос:** влияет ли среднее число кликов на контактный день на вероятность заказа?\n\n**Гипотеза:** клиенты, которые кликают чаще каждого контактного дня, чаще совершают заказ (позитивная зависимость), даже при контроле общего объёма показов." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n (eda.ORDER_COLS, \"orders_amt_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\ncontact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\nclient = df.groupby(\"id\").agg(\n {\n \"imp_total\": \"sum\",\n \"click_total\": \"sum\",\n \"orders_amt_total\": \"sum\",\n \"age\": \"median\",\n \"gender_cd\": lambda s: s.mode().iat[0],\n \"device_platform_cd\": lambda s: s.mode().iat[0],\n }\n).reset_index().merge(contact_days, on=\"id\", how=\"left\")\n\nclient[\"clicks_per_day\"] = eda.safe_divide(client[\"click_total\"], client[\"contact_days\"])\nclient[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\nclient.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Визуализация: заказы vs клики на контактный день" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bins = pd.qcut(client[\"clicks_per_day\"], 8, duplicates=\"drop\")\norder_rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\norder_rate[\"clicks_per_day\"] = order_rate[\"clicks_per_day\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=order_rate, x=\"clicks_per_day\", y=\"has_order\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"Доля клиентов с заказом vs клики на контактный день\")\nplt.tight_layout()\nplt.show()\norder_rate\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ML-модель: клики/день → заказ\nTarget: `has_order`. Фичи: клики/день, объём показов, возраст, пол, платформа." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = client[[\"clicks_per_day\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"has_order\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"clicks_per_day\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Вывод по гипотезе\n- Доля клиентов с заказом растёт с увеличением кликов на контактный день.\n- В модели `clicks_per_day` — топовый позитивный фактор, AUC ~0.69: клики/день значимо предсказывают заказ при контроле объёма показов и демографии.\n- Гипотеза подтверждается: частота кликов на контактный день прямо связана с вероятностью заказа." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/alternative/ent_passive_ctr_uplift/analysis.ipynb b/alternative/ent_passive_ctr_uplift/analysis.ipynb new file mode 100644 index 0000000..6bf295e --- /dev/null +++ b/alternative/ent_passive_ctr_uplift/analysis.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Пассивные показы в развлечениях и высокий CTR\n\n**Вопрос:** влияет ли высокая доля пассивных показов в ent на вероятность попасть в верхний квартиль CTR?\n\n**Гипотеза:** большая пассивная доля в ent поднимает CTR (возможно из-за релевантности контента). Проверяем через ML-классификацию `high_ctr`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\nclient = df.groupby(\"id\").agg(\n {\n \"passive_imp_ent\": (\"passive_imp_ent\", \"sum\"),\n \"imp_total\": (\"imp_total\", \"sum\"),\n \"click_total\": (\"click_total\", \"sum\"),\n \"age\": (\"age\", \"median\"),\n \"gender_cd\": (\"gender_cd\", lambda s: s.mode().iat[0]),\n \"device_platform_cd\": (\"device_platform_cd\", lambda s: s.mode().iat[0]),\n }\n).reset_index()\n\nclient[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\nclient[\"passive_ent_share\"] = eda.safe_divide(client[\"passive_imp_ent\"], client[\"imp_total\"])\nclient[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\nclient.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Визуализация: доля пассивных ent vs CTR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bins = pd.qcut(client[\"passive_ent_share\"], 8, duplicates=\"drop\")\nmed = client.groupby(bins)[\"ctr_all\"].median().reset_index()\nmed[\"passive_ent_share\"] = med[\"passive_ent_share\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=med, x=\"passive_ent_share\", y=\"ctr_all\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"CTR vs доля пассивных ent показов\")\nplt.tight_layout()\nplt.show()\nmed\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ML-модель на high CTR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = client[[\"passive_ent_share\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"high_ctr\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"passive_ent_share\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Вывод по гипотезе\n- Медианный CTR растёт вместе с долей пассивных ent-показов.\n- В модели `passive_ent_share` — топ-фича с положительным знаком, AUC ~0.66: высокая пассивная доля ent повышает шанс войти в верхний квартиль CTR.\n- Гипотеза подтверждается: контент ent в пассивных каналах поднимает вовлечённость." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}