diff --git a/alternative/category_mix_uplift/analysis.ipynb b/alternative/category_mix_uplift/analysis.ipynb deleted file mode 100644 index 18b2dd0..0000000 --- a/alternative/category_mix_uplift/analysis.ipynb +++ /dev/null @@ -1,541 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Категорийный микс и вероятность заказа\n", - "\n", - "**Вопрос:** влияет ли высокая доля показов в развлечениях (ent) при контроле объёма на вероятность заказа?\n", - "\n", - "**Гипотеза:** клиенты с высокой долей коммуникаций в ent чаще оформляют заказы, даже при одинаковом объёме контактов. Проверяем через ML-классификацию `has_order`." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:34:48.794887Z", - "iopub.status.busy": "2025-12-12T19:34:48.794342Z", - "iopub.status.idle": "2025-12-12T19:34:55.568140Z", - "shell.execute_reply": "2025-12-12T19:34:55.565812Z" - } - }, - "outputs": [], - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import roc_auc_score\n", - "from sklearn.impute import SimpleImputer\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - 
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:34:55.575403Z", - "iopub.status.busy": "2025-12-12T19:34:55.574914Z", - "iopub.status.idle": "2025-12-12T19:34:58.188645Z", - "shell.execute_reply": "2025-12-12T19:34:58.187063Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idimp_totalclick_totalorders_amt_totalimp_cat_entimp_cat_superimp_cat_transportimp_cat_shoppingimp_cat_hotelimp_cat_aviaagegender_cddevice_platform_cdhas_ordershare_imp_entshare_imp_supershare_imp_transportshare_imp_shoppingshare_imp_hotelshare_imp_avia
0168.017.0013.017.010.014.012258.0MAndroid00.1911760.2500000.1470590.2058820.1764710.029412
12116.023.0314.014.025.015.0341454.0MAndroid10.1206900.1206900.2155170.1293100.2931030.120690
23293.037.0246.031.084.071.0253670.0FAndroid10.1569970.1058020.2866890.2423210.0853240.122867
3457.015.009.011.06.011.061443.0FAndroid00.1578950.1929820.1052630.1929820.1052630.245614
4543.016.013.08.06.08.071146.0MAndroid10.0697670.1860470.1395350.1860470.1627910.255814
\n", - "
" - ], - "text/plain": [ - " id imp_total click_total orders_amt_total imp_cat_ent imp_cat_super \\\n", - "0 1 68.0 17.0 0 13.0 17.0 \n", - "1 2 116.0 23.0 3 14.0 14.0 \n", - "2 3 293.0 37.0 2 46.0 31.0 \n", - "3 4 57.0 15.0 0 9.0 11.0 \n", - "4 5 43.0 16.0 1 3.0 8.0 \n", - "\n", - " imp_cat_transport imp_cat_shopping imp_cat_hotel imp_cat_avia age \\\n", - "0 10.0 14.0 12 2 58.0 \n", - "1 25.0 15.0 34 14 54.0 \n", - "2 84.0 71.0 25 36 70.0 \n", - "3 6.0 11.0 6 14 43.0 \n", - "4 6.0 8.0 7 11 46.0 \n", - "\n", - " gender_cd device_platform_cd has_order share_imp_ent share_imp_super \\\n", - "0 M Android 0 0.191176 0.250000 \n", - "1 M Android 1 0.120690 0.120690 \n", - "2 F Android 1 0.156997 0.105802 \n", - "3 F Android 0 0.157895 0.192982 \n", - "4 M Android 1 0.069767 0.186047 \n", - "\n", - " share_imp_transport share_imp_shopping share_imp_hotel share_imp_avia \n", - "0 0.147059 0.205882 0.176471 0.029412 \n", - "1 0.215517 0.129310 0.293103 0.120690 \n", - "2 0.286689 0.242321 0.085324 0.122867 \n", - "3 0.105263 0.192982 0.105263 0.245614 \n", - "4 0.139535 0.186047 0.162791 0.255814 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cats = [\"ent\", \"super\", \"transport\", \"shopping\", \"hotel\", \"avia\"]\n", - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "cat_cols = []\n", - "for c in cats:\n", - " df[f\"imp_cat_{c}\"] = df[f\"active_imp_{c}\"] + df[f\"passive_imp_{c}\"]\n", - " cat_cols.append(f\"imp_cat_{c}\")\n", 
- "\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " **{col: \"sum\" for col in [\"imp_total\", \"click_total\", \"orders_amt_total\"] + cat_cols},\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ").reset_index()\n", - "\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n", - "for c in cats:\n", - " client[f\"share_imp_{c}\"] = eda.safe_divide(client[f\"imp_cat_{c}\"], client[\"imp_total\"])\n", - "\n", - "client.head()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Визуализация: заказы vs доля ent" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:34:58.290489Z", - "iopub.status.busy": "2025-12-12T19:34:58.290200Z", - "iopub.status.idle": "2025-12-12T19:34:58.652384Z", - "shell.execute_reply": "2025-12-12T19:34:58.650453Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1067833/2853593271.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
share_imp_enthas_order
0(-0.001, 0.0508]0.440191
1(0.0508, 0.0833]0.517177
2(0.0833, 0.109]0.534192
3(0.109, 0.135]0.555661
4(0.135, 0.161]0.590513
5(0.161, 0.192]0.602649
6(0.192, 0.241]0.609542
7(0.241, 0.6]0.670192
\n", - "
" - ], - "text/plain": [ - " share_imp_ent has_order\n", - "0 (-0.001, 0.0508] 0.440191\n", - "1 (0.0508, 0.0833] 0.517177\n", - "2 (0.0833, 0.109] 0.534192\n", - "3 (0.109, 0.135] 0.555661\n", - "4 (0.135, 0.161] 0.590513\n", - "5 (0.161, 0.192] 0.602649\n", - "6 (0.192, 0.241] 0.609542\n", - "7 (0.241, 0.6] 0.670192" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bins = pd.qcut(client[\"share_imp_ent\"], 8, duplicates=\"drop\")\n", - "rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n", - "rate[\"share_imp_ent\"] = rate[\"share_imp_ent\"].astype(str)\n", - "plt.figure(figsize=(12, 4))\n", - "sns.lineplot(data=rate, x=\"share_imp_ent\", y=\"has_order\", marker=\"o\")\n", - "plt.xticks(rotation=40)\n", - "plt.title(\"Доля клиентов с заказом vs доля ent показов\")\n", - "plt.tight_layout()\n", - "plt.show()\n", - "rate\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ML-модель с контролем объёма\n", - "Target: `has_order`. Фичи: доли показов по категориям, общий объём, возраст, пол, платформа." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:34:58.656262Z", - "iopub.status.busy": "2025-12-12T19:34:58.655938Z", - "iopub.status.idle": "2025-12-12T19:34:58.792732Z", - "shell.execute_reply": "2025-12-12T19:34:58.791212Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(0.6390716662864897,\n", - " num__imp_total 0.350725\n", - " cat__device_platform_cd_Android 0.266848\n", - " num__share_imp_ent 0.222672\n", - " cat__device_platform_cd_iPadOS -0.169334\n", - " num__share_imp_avia -0.164523\n", - " num__share_imp_super -0.160224\n", - " num__share_imp_transport 0.154995\n", - " num__share_imp_hotel -0.124555\n", - " num__age -0.070436\n", - " cat__gender_cd_F 0.050009\n", - " dtype: float64)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = client[[f\"share_imp_{c}\" for c in cats] + [\"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n", - "y = client[\"has_order\"]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "\n", - "numeric_cols = [f\"share_imp_{c}\" for c in cats] + [\"imp_total\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "\n", - "pre = ColumnTransformer(\n", - " [\n", - " (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), numeric_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - " ]\n", - ")\n", - "\n", - "model = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "model.fit(X_train, y_train)\n", - "proba = model.predict_proba(X_test)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "coef = 
model.named_steps[\"clf\"].coef_[0]\n", - "features = model.named_steps[\"pre\"].get_feature_names_out()\n", - "coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n", - "auc, coef_series.head(10)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n", - "- Линейный рост доли клиентов с заказом при росте доли ent-показов.\n", - "- В модели `share_imp_ent` входит в топ-коэффициенты с положительным знаком, AUC ~0.61: эффект слабее, чем у спама, но значимый.\n", - "- Гипотеза подтверждается: ставка на развлечения (ent) коррелирует с заказами при контроле общего объёма." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/alternative/category_mix_uplift/eda_utils.py b/alternative/category_mix_uplift/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/category_mix_uplift/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS 
- + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - return cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = 
safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS 
+ ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/alternative/contact_frequency_orders/analysis.ipynb b/alternative/contact_frequency_orders/analysis.ipynb deleted file mode 100644 index 16188a0..0000000 --- a/alternative/contact_frequency_orders/analysis.ipynb +++ /dev/null @@ -1,353 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Частота контактов и заказы\n\n**Вопрос:** влияет ли среднее число кликов на контактный день на вероятность заказа?\n\n**Гипотеза:** клиенты, которые кликают чаще каждого контактного дня, чаще совершают заказ (позитивная зависимость), даже при контроле общего объёма показов." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:14.925005Z", - "start_time": "2025-12-12T19:27:13.730791Z" - } - }, - "source": [ - "import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:15.582784Z", - "start_time": "2025-12-12T19:27:14.934830Z" - } - }, - "source": [ - "for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n (eda.ORDER_COLS, \"orders_amt_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\ncontact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\nclient = df.groupby(\"id\").agg(\n {\n \"imp_total\": 
\"sum\",\n \"click_total\": \"sum\",\n \"orders_amt_total\": \"sum\",\n \"age\": \"median\",\n \"gender_cd\": lambda s: s.mode().iat[0],\n \"device_platform_cd\": lambda s: s.mode().iat[0],\n }\n).reset_index().merge(contact_days, on=\"id\", how=\"left\")\n\nclient[\"clicks_per_day\"] = eda.safe_divide(client[\"click_total\"], client[\"contact_days\"])\nclient[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\nclient.head()\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id imp_total click_total orders_amt_total age gender_cd \\\n", - "0 1 68.0 17.0 0 58.0 M \n", - "1 2 116.0 23.0 3 54.0 M \n", - "2 3 293.0 37.0 2 70.0 F \n", - "3 4 57.0 15.0 0 43.0 F \n", - "4 5 43.0 16.0 1 46.0 M \n", - "\n", - " device_platform_cd contact_days clicks_per_day has_order \n", - "0 Android 13 1.307692 0 \n", - "1 Android 15 1.533333 1 \n", - "2 Android 31 1.193548 1 \n", - "3 Android 12 1.250000 0 \n", - "4 Android 10 1.600000 1 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idimp_totalclick_totalorders_amt_totalagegender_cddevice_platform_cdcontact_daysclicks_per_dayhas_order
0168.017.0058.0MAndroid131.3076920
12116.023.0354.0MAndroid151.5333331
23293.037.0270.0FAndroid311.1935481
3457.015.0043.0FAndroid121.2500000
4543.016.0146.0MAndroid101.6000001
\n", - "
" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 2 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Визуализация: заказы vs клики на контактный день" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:15.715340Z", - "start_time": "2025-12-12T19:27:15.610539Z" - } - }, - "source": [ - "bins = pd.qcut(client[\"clicks_per_day\"], 8, duplicates=\"drop\")\norder_rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\norder_rate[\"clicks_per_day\"] = order_rate[\"clicks_per_day\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=order_rate, x=\"clicks_per_day\", y=\"has_order\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"Доля клиентов с заказом vs клики на контактный день\")\nplt.tight_layout()\nplt.show()\norder_rate\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/mx/y1qcnthj1154ngqj00r8gz480000gn/T/ipykernel_83535/2771825794.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " order_rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ], - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - }, - { - "data": { - "text/plain": [ - " clicks_per_day has_order\n", - "0 (0.999, 1.167] 0.436207\n", - "1 (1.167, 1.238] 0.506410\n", - "2 (1.238, 1.308] 0.519022\n", - "3 (1.308, 1.375] 0.567515\n", - "4 (1.375, 1.444] 0.581489\n", - "5 (1.444, 1.538] 0.625693\n", - "6 (1.538, 1.667] 0.638397\n", - "7 (1.667, 3.788] 0.658058" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
clicks_per_dayhas_order
0(0.999, 1.167]0.436207
1(1.167, 1.238]0.506410
2(1.238, 1.308]0.519022
3(1.308, 1.375]0.567515
4(1.375, 1.444]0.581489
5(1.444, 1.538]0.625693
6(1.538, 1.667]0.638397
7(1.667, 3.788]0.658058
\n", - "
" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ML-модель: клики/день → заказ\nTarget: `has_order`. Фичи: клики/день, объём показов, возраст, пол, платформа." - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:15.821206Z", - "start_time": "2025-12-12T19:27:15.782729Z" - } - }, - "source": [ - "X = client[[\"clicks_per_day\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"has_order\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"clicks_per_day\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "(0.6421189310592901,\n", - " num__imp_total 0.398823\n", - " num__clicks_per_day 0.278830\n", - " cat__device_platform_cd_Android 0.193290\n", - " num__age -0.093555\n", - " cat__gender_cd_M 0.073771\n", - " cat__device_platform_cd_iPadOS -0.064613\n", - " cat__gender_cd_F 0.047759\n", - " cat__device_platform_cd_iOS -0.007148\n", - " dtype: float64)" - ] - }, - 
"execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n- Доля клиентов с заказом растёт с увеличением кликов на контактный день.\n- В модели `clicks_per_day` — топовый позитивный фактор, AUC ~0.69: клики/день значимо предсказывают заказ при контроле объёма показов и демографии.\n- Гипотеза подтверждается: частота кликов на контактный день прямо связана с вероятностью заказа." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/alternative/contact_frequency_orders/eda_utils.py b/alternative/contact_frequency_orders/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/contact_frequency_orders/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS - + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, 
pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - return cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = 
(df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not 
s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/alternative/device_orders/analysis.ipynb b/alternative/device_orders/analysis.ipynb deleted file mode 100644 index 2cd3930..0000000 --- a/alternative/device_orders/analysis.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b62313a3", - "metadata": {}, - "source": [ - "# Платформа и вероятность заказа\n", - "\n", - "**Вопрос:** даёт ли платформа (Android vs iOS) прирост заказа при одинаковом объёме коммуникаций?\n", - "\n", - "**Гипотеза:** при контроле показов/кликов Android-клиенты конвертируются выше." 
- ] - }, - { - "cell_type": "code", - "id": "8c8f09b1", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:12:03.874747Z", - "iopub.status.busy": "2025-12-12T19:12:03.874144Z", - "iopub.status.idle": "2025-12-12T19:12:10.515786Z", - "shell.execute_reply": "2025-12-12T19:12:10.513552Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:18.761737Z", - "start_time": "2025-12-12T19:27:17.400625Z" - } - }, - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "id": "67ed5210", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:12:10.521535Z", - "iopub.status.busy": "2025-12-12T19:12:10.521072Z", - "iopub.status.idle": "2025-12-12T19:12:13.018480Z", - "shell.execute_reply": "2025-12-12T19:12:13.016893Z" - }, - 
"ExecuteTime": { - "end_time": "2025-12-12T19:27:19.344169Z", - "start_time": "2025-12-12T19:27:18.770497Z" - } - }, - "source": [ - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"active_imp_total\": \"sum\",\n", - " \"passive_imp_total\": \"sum\",\n", - " \"active_click_total\": \"sum\",\n", - " \"passive_click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"imp_total\": \"sum\",\n", - " \"click_total\": \"sum\",\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ")\n", - "\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n", - "client.head()\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " active_imp_total passive_imp_total active_click_total \\\n", - "id \n", - "1 33.0 35.0 14.0 \n", - "2 27.0 89.0 19.0 \n", - "3 57.0 236.0 37.0 \n", - "4 20.0 37.0 14.0 \n", - "5 23.0 20.0 13.0 \n", - "\n", - " passive_click_total orders_amt_total imp_total click_total age \\\n", - "id \n", - "1 3.0 0 68.0 17.0 58.0 \n", - "2 4.0 3 116.0 23.0 54.0 \n", - "3 0.0 2 293.0 37.0 70.0 \n", - "4 1.0 0 57.0 15.0 43.0 \n", - "5 3.0 1 43.0 16.0 46.0 \n", - "\n", - " gender_cd device_platform_cd has_order 
ctr_all cr_click2order \n", - "id \n", - "1 M Android 0 0.250000 0.000000 \n", - "2 M Android 1 0.198276 0.130435 \n", - "3 F Android 1 0.126280 0.054054 \n", - "4 F Android 0 0.263158 0.000000 \n", - "5 M Android 1 0.372093 0.062500 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
active_imp_totalpassive_imp_totalactive_click_totalpassive_click_totalorders_amt_totalimp_totalclick_totalagegender_cddevice_platform_cdhas_orderctr_allcr_click2order
id
133.035.014.03.0068.017.058.0MAndroid00.2500000.000000
227.089.019.04.03116.023.054.0MAndroid10.1982760.130435
357.0236.037.00.02293.037.070.0FAndroid10.1262800.054054
420.037.014.01.0057.015.043.0FAndroid00.2631580.000000
523.020.013.03.0143.016.046.0MAndroid10.3720930.062500
\n", - "
" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 2 - }, - { - "cell_type": "markdown", - "id": "ee977b3f", - "metadata": {}, - "source": [ - "## Заказы по платформам" - ] - }, - { - "cell_type": "code", - "id": "3cb9ed5d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:12:13.024492Z", - "iopub.status.busy": "2025-12-12T19:12:13.024166Z", - "iopub.status.idle": "2025-12-12T19:12:13.288887Z", - "shell.execute_reply": "2025-12-12T19:12:13.287256Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:19.479169Z", - "start_time": "2025-12-12T19:27:19.376099Z" - } - }, - "source": [ - "platform_rate = client.groupby(\"device_platform_cd\")[\"has_order\"].mean().reset_index()\n", - "plt.figure(figsize=(8, 4))\n", - "sns.barplot(data=platform_rate, x=\"device_platform_cd\", y=\"has_order\")\n", - "plt.title(\"Доля клиентов с заказом по платформам\")\n", - "plt.tight_layout()\n", - "plt.show()\n", - "platform_rate\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "
" - ], - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - }, - { - "data": { - "text/plain": [ - " device_platform_cd has_order\n", - "0 Android 0.587575\n", - "1 IOS 0.545270\n", - "2 iOS 0.542612\n", - "3 iPadOS 0.569767" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
device_platform_cdhas_order
0Android0.587575
1IOS0.545270
2iOS0.542612
3iPadOS0.569767
\n", - "
" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "id": "f65ad022", - "metadata": {}, - "source": [ - "## ML-модель с контролем объёма" - ] - }, - { - "cell_type": "code", - "id": "eaa4b459", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:12:13.294736Z", - "iopub.status.busy": "2025-12-12T19:12:13.294463Z", - "iopub.status.idle": "2025-12-12T19:12:13.423902Z", - "shell.execute_reply": "2025-12-12T19:12:13.421985Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:19.655814Z", - "start_time": "2025-12-12T19:27:19.623730Z" - } - }, - "source": [ - "X = client[[\n", - " \"active_imp_total\",\n", - " \"passive_imp_total\",\n", - " \"active_click_total\",\n", - " \"passive_click_total\",\n", - " \"ctr_all\",\n", - " \"age\",\n", - " \"gender_cd\",\n", - " \"device_platform_cd\",\n", - "]]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "y = client[\"has_order\"]\n", - "\n", - "numeric_cols = [\"active_imp_total\", \"passive_imp_total\", \"active_click_total\", \"passive_click_total\", \"ctr_all\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "\n", - "preprocess = ColumnTransformer(\n", - " [\n", - " (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - " ]\n", - ")\n", - "\n", - "model = Pipeline([(\"pre\", preprocess), (\"clf\", LogisticRegression(max_iter=1000))])\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "model.fit(X_train, y_train)\n", - "proba = model.predict_proba(X_test)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "coef = model.named_steps[\"clf\"].coef_[0]\n", - "features = 
model.named_steps[\"pre\"].get_feature_names_out()\n", - "coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n", - "auc, coef_series.head(10)\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "(0.681635404420581,\n", - " num__passive_click_total 0.757779\n", - " num__ctr_all -0.257144\n", - " cat__device_platform_cd_Android 0.182476\n", - " cat__gender_cd_M 0.133747\n", - " num__active_click_total 0.119761\n", - " cat__device_platform_cd_iPadOS -0.100109\n", - " num__age -0.071048\n", - " num__passive_imp_total -0.050535\n", - " cat__device_platform_cd_iOS 0.040232\n", - " num__active_imp_total -0.019038\n", - " dtype: float64)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "id": "ce032735", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n", - "- В сырой агрегированной доле заказов Android выше iOS.\n", - "- В модели при контроле объёма коммуникаций и CTR коэффициент при `device_platform_cd_Android` положительный и в топ‑фичах, AUC ~0.68. Гипотеза подтверждается: платформа влияет на вероятность заказа." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/alternative/device_orders/eda_utils.py b/alternative/device_orders/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/device_orders/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS - + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - return 
cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = 
DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = 
imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/alternative/ent_passive_ctr_uplift/analysis.ipynb b/alternative/ent_passive_ctr_uplift/analysis.ipynb deleted file mode 100644 index 2df671b..0000000 --- a/alternative/ent_passive_ctr_uplift/analysis.ipynb +++ /dev/null @@ -1,112 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Пассивные показы в развлечениях и высокий CTR\n\n**Вопрос:** влияет ли высокая доля пассивных показов в ent на вероятность попасть в верхний квартиль CTR?\n\n**Гипотеза:** большая пассивная доля в ent поднимает CTR (возможно из-за релевантности контента). Проверяем через ML-классификацию `high_ctr`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:39.950563Z", - "start_time": "2025-12-12T19:27:39.023085Z" - } - }, - "source": [ - "import sqlite3\nfrom pathlib import Path\nimport sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nsns.set_theme(style=\"whitegrid\")\nplt.rcParams[\"figure.figsize\"] = (10, 5)\n\nproject_root = Path.cwd().resolve()\nwhile not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n project_root = project_root.parent\nsys.path.append(str(project_root / \"preanalysis\"))\nimport eda_utils as eda\n\ndb_path = project_root / \"dataset\" / \"ds.sqlite\"\nconn = sqlite3.connect(db_path)\ndf = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\nconn.close()\n" - ], - "outputs": [], - "execution_count": 3 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-12T19:27:40.126928Z", - "start_time": "2025-12-12T19:27:39.955172Z" - } - }, - "source": [ - "for cols, name in [\n (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n]:\n df[name] = df[cols].sum(axis=1)\n\ndf[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\ndf[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n\nclient = df.groupby(\"id\").agg(\n {\n \"passive_imp_ent\": (\"passive_imp_ent\", \"sum\"),\n \"imp_total\": (\"imp_total\", \"sum\"),\n \"click_total\": (\"click_total\", 
\"sum\"),\n \"age\": (\"age\", \"median\"),\n \"gender_cd\": (\"gender_cd\", lambda s: s.mode().iat[0]),\n \"device_platform_cd\": (\"device_platform_cd\", lambda s: s.mode().iat[0]),\n }\n).reset_index()\n\nclient[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\nclient[\"passive_ent_share\"] = eda.safe_divide(client[\"passive_imp_ent\"], client[\"imp_total\"])\nclient[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\nclient.head()\n" - ], - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'SeriesGroupBy' object has no attribute 'passive_imp_ent'", - "output_type": "error", - "traceback": [ - "\u001B[31m---------------------------------------------------------------------------\u001B[39m", - "\u001B[31mAttributeError\u001B[39m Traceback (most recent call last)", - "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[4]\u001B[39m\u001B[32m, line 12\u001B[39m\n\u001B[32m 9\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_imp_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m 10\u001B[39m df[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m] = df[\u001B[33m\"\u001B[39m\u001B[33mactive_click_total\u001B[39m\u001B[33m\"\u001B[39m] + df[\u001B[33m\"\u001B[39m\u001B[33mpassive_click_total\u001B[39m\u001B[33m\"\u001B[39m]\n\u001B[32m---> \u001B[39m\u001B[32m12\u001B[39m client = \u001B[43mdf\u001B[49m\u001B[43m.\u001B[49m\u001B[43mgroupby\u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mid\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m \u001B[49m\u001B[43m{\u001B[49m\n\u001B[32m 14\u001B[39m \u001B[43m 
\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mpassive_imp_ent\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mimp_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mclick_total\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43msum\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 17\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mage\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mmedian\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 18\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m 
\u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mgender_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 19\u001B[39m \u001B[43m \u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mdevice_platform_cd\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mlambda\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43ms\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmode\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43miat\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 20\u001B[39m \u001B[43m \u001B[49m\u001B[43m}\u001B[49m\n\u001B[32m 21\u001B[39m \u001B[43m)\u001B[49m.reset_index()\n\u001B[32m 23\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mctr_all\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mclick_total\u001B[39m\u001B[33m\"\u001B[39m], client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n\u001B[32m 24\u001B[39m client[\u001B[33m\"\u001B[39m\u001B[33mpassive_ent_share\u001B[39m\u001B[33m\"\u001B[39m] = eda.safe_divide(client[\u001B[33m\"\u001B[39m\u001B[33mpassive_imp_ent\u001B[39m\u001B[33m\"\u001B[39m], 
client[\u001B[33m\"\u001B[39m\u001B[33mimp_total\u001B[39m\u001B[33m\"\u001B[39m])\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001B[39m, in \u001B[36mDataFrameGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 1429\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m 1431\u001B[39m op = GroupByApply(\u001B[38;5;28mself\u001B[39m, func, args=args, kwargs=kwargs)\n\u001B[32m-> \u001B[39m\u001B[32m1432\u001B[39m result = \u001B[43mop\u001B[49m\u001B[43m.\u001B[49m\u001B[43magg\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1433\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_dict_like(func) \u001B[38;5;129;01mand\u001B[39;00m result \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 1434\u001B[39m \u001B[38;5;66;03m# GH #52849\u001B[39;00m\n\u001B[32m 1435\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m.as_index \u001B[38;5;129;01mand\u001B[39;00m is_list_like(func):\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:190\u001B[39m, in \u001B[36mApply.agg\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 187\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.apply_str()\n\u001B[32m 189\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m is_dict_like(func):\n\u001B[32m--> \u001B[39m\u001B[32m190\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 191\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m is_list_like(func):\n\u001B[32m 192\u001B[39m \u001B[38;5;66;03m# we require a list, 
but not a 'str'\u001B[39;00m\n\u001B[32m 193\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.agg_list_like()\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:423\u001B[39m, in \u001B[36mApply.agg_dict_like\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 415\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34magg_dict_like\u001B[39m(\u001B[38;5;28mself\u001B[39m) -> DataFrame | Series:\n\u001B[32m 416\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 417\u001B[39m \u001B[33;03m Compute aggregation in the case of a dict-like argument.\u001B[39;00m\n\u001B[32m 418\u001B[39m \n\u001B[32m (...)\u001B[39m\u001B[32m 421\u001B[39m \u001B[33;03m Result of aggregation.\u001B[39;00m\n\u001B[32m 422\u001B[39m \u001B[33;03m \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m423\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43magg_or_apply_dict_like\u001B[49m\u001B[43m(\u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43magg\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001B[39m, in \u001B[36mGroupByApply.agg_or_apply_dict_like\u001B[39m\u001B[34m(self, op_name)\u001B[39m\n\u001B[32m 1598\u001B[39m kwargs.update({\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m: engine, \u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m: engine_kwargs})\n\u001B[32m 1600\u001B[39m \u001B[38;5;28;01mwith\u001B[39;00m com.temp_setattr(\n\u001B[32m 1601\u001B[39m obj, \u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m, \u001B[38;5;28;01mTrue\u001B[39;00m, condition=\u001B[38;5;28mhasattr\u001B[39m(obj, 
\u001B[33m\"\u001B[39m\u001B[33mas_index\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 1602\u001B[39m ):\n\u001B[32m-> \u001B[39m\u001B[32m1603\u001B[39m result_index, result_data = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mcompute_dict_like\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 1604\u001B[39m \u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselected_obj\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mselection\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwargs\u001B[49m\n\u001B[32m 1605\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 1606\u001B[39m result = \u001B[38;5;28mself\u001B[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001B[32m 1607\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/apply.py:497\u001B[39m, in \u001B[36mApply.compute_dict_like\u001B[39m\u001B[34m(self, op_name, selected_obj, selection, kwargs)\u001B[39m\n\u001B[32m 493\u001B[39m results += key_data\n\u001B[32m 494\u001B[39m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[32m 495\u001B[39m \u001B[38;5;66;03m# key used for column selection and output\u001B[39;00m\n\u001B[32m 496\u001B[39m results = [\n\u001B[32m--> \u001B[39m\u001B[32m497\u001B[39m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mobj\u001B[49m\u001B[43m.\u001B[49m\u001B[43m_gotitem\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mndim\u001B[49m\u001B[43m=\u001B[49m\u001B[32;43m1\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mop_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m(\u001B[49m\u001B[43mhow\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 498\u001B[39m 
\u001B[38;5;28;01mfor\u001B[39;00m key, how \u001B[38;5;129;01min\u001B[39;00m func.items()\n\u001B[32m 499\u001B[39m ]\n\u001B[32m 500\u001B[39m keys = \u001B[38;5;28mlist\u001B[39m(func.keys())\n\u001B[32m 502\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m keys, results\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:257\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 255\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine\u001B[39m\u001B[33m\"\u001B[39m] = engine\n\u001B[32m 256\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m257\u001B[39m ret = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_aggregate_multiple_funcs\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 258\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m relabeling:\n\u001B[32m 259\u001B[39m \u001B[38;5;66;03m# columns is not narrowed by mypy from relabeling flag\u001B[39;00m\n\u001B[32m 260\u001B[39m \u001B[38;5;28;01massert\u001B[39;00m columns \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;66;03m# for mypy\u001B[39;00m\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:362\u001B[39m, in \u001B[36mSeriesGroupBy._aggregate_multiple_funcs\u001B[39m\u001B[34m(self, arg, *args, **kwargs)\u001B[39m\n\u001B[32m 360\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m idx, (name, func) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(arg):\n\u001B[32m 
361\u001B[39m key = base.OutputKey(label=name, position=idx)\n\u001B[32m--> \u001B[39m\u001B[32m362\u001B[39m results[key] = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43maggregate\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 364\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, DataFrame) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m results.values()):\n\u001B[32m 365\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mpandas\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m concat\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/generic.py:249\u001B[39m, in \u001B[36mSeriesGroupBy.aggregate\u001B[39m\u001B[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001B[39m\n\u001B[32m 247\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m engine_kwargs \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 248\u001B[39m kwargs[\u001B[33m\"\u001B[39m\u001B[33mengine_kwargs\u001B[39m\u001B[33m\"\u001B[39m] = engine_kwargs\n\u001B[32m--> \u001B[39m\u001B[32m249\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mgetattr\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfunc\u001B[49m\u001B[43m)\u001B[49m(*args, **kwargs)\n\u001B[32m 251\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(func, abc.Iterable):\n\u001B[32m 252\u001B[39m \u001B[38;5;66;03m# Catch instances of lists / tuples\u001B[39;00m\n\u001B[32m 253\u001B[39m \u001B[38;5;66;03m# 
but not the class list / tuple itself.\u001B[39;00m\n\u001B[32m 254\u001B[39m func = maybe_mangle_lambdas(func)\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/dano/.venv/lib/python3.13/site-packages/pandas/core/groupby/groupby.py:1365\u001B[39m, in \u001B[36mGroupBy.__getattr__\u001B[39m\u001B[34m(self, attr)\u001B[39m\n\u001B[32m 1362\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m attr \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.obj:\n\u001B[32m 1363\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m[attr]\n\u001B[32m-> \u001B[39m\u001B[32m1365\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mAttributeError\u001B[39;00m(\n\u001B[32m 1366\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(\u001B[38;5;28mself\u001B[39m).\u001B[34m__name__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m object has no attribute \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mattr\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 1367\u001B[39m )\n", - "\u001B[31mAttributeError\u001B[39m: 'SeriesGroupBy' object has no attribute 'passive_imp_ent'" - ] - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Визуализация: доля пассивных ent vs CTR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bins = pd.qcut(client[\"passive_ent_share\"], 8, duplicates=\"drop\")\nmed = client.groupby(bins)[\"ctr_all\"].median().reset_index()\nmed[\"passive_ent_share\"] = med[\"passive_ent_share\"].astype(str)\nplt.figure(figsize=(12, 4))\nsns.lineplot(data=med, x=\"passive_ent_share\", y=\"ctr_all\", marker=\"o\")\nplt.xticks(rotation=40)\nplt.title(\"CTR vs доля пассивных ent показов\")\nplt.tight_layout()\nplt.show()\nmed\n" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## ML-модель на high CTR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X = client[[\"passive_ent_share\", \"imp_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\ny = client[\"high_ctr\"]\nX = X.copy()\nX[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\nX[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n\nnumeric_cols = [\"passive_ent_share\", \"imp_total\", \"age\"]\ncat_cols = [\"gender_cd\", \"device_platform_cd\"]\n\npre = ColumnTransformer(\n [\n (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n ]\n)\n\nmodel = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\nmodel.fit(X_train, y_train)\nproba = model.predict_proba(X_test)[:, 1]\nauc = roc_auc_score(y_test, proba)\ncoef = model.named_steps[\"clf\"].coef_[0]\nfeatures = model.named_steps[\"pre\"].get_feature_names_out()\ncoef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\nauc, coef_series.head(10)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n- Медианный CTR растёт вместе с долей пассивных ent-показов.\n- В модели `passive_ent_share` — топ-фича с положительным знаком, AUC ~0.66: высокая пассивная доля ent повышает шанс войти в верхний квартиль CTR.\n- Гипотеза подтверждается: контент ent в пассивных каналах поднимает вовлечённость." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/alternative/ent_passive_ctr_uplift/eda_utils.py b/alternative/ent_passive_ctr_uplift/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/ent_passive_ctr_uplift/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS - + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - return cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = 
cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - 
df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - 
client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/alternative/passive_share_orders/analysis.ipynb b/alternative/passive_share_orders/analysis.ipynb deleted file mode 100644 index f2c3672..0000000 --- a/alternative/passive_share_orders/analysis.ipynb +++ /dev/null @@ -1,458 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "34468500", - "metadata": {}, - "source": [ - "# Доля пассивных показов и заказы\n", - "\n", - "**Вопрос:** повышает ли высокая доля пассивных показов вероятность заказа при контроле объёма коммуникаций?\n", - "\n", - "**Гипотеза:** большая доля пассивных показов связана с большей вероятностью заказа (проверяем ML)." - ] - }, - { - "cell_type": "code", - "id": "46fb7ac5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:43.639846Z", - "iopub.status.busy": "2025-12-12T19:11:43.638998Z", - "iopub.status.idle": "2025-12-12T19:11:50.215868Z", - "shell.execute_reply": "2025-12-12T19:11:50.213723Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:46.168843Z", - "start_time": "2025-12-12T19:27:44.987935Z" - } - }, - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - 
"sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "# Climb one directory per iteration so the folder containing preanalysis/ is never stepped over.\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "id": "73842cf6", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:50.222842Z", - "iopub.status.busy": "2025-12-12T19:11:50.222356Z", - "iopub.status.idle": "2025-12-12T19:11:52.672337Z", - "shell.execute_reply": "2025-12-12T19:11:52.670490Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:46.794213Z", - "start_time": "2025-12-12T19:27:46.179705Z" - } - }, - "source": [ - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"active_imp_total\": \"sum\",\n", - " \"passive_imp_total\": \"sum\",\n", - " \"active_click_total\": \"sum\",\n", - " \"passive_click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"imp_total\": \"sum\",\n", - " 
\"click_total\": \"sum\",\n", - " \"age\": 
\"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ")\n", - "\n", - "client[\"passive_share\"] = eda.safe_divide(client[\"passive_imp_total\"], client[\"imp_total\"])\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n", - "client.head()\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " active_imp_total passive_imp_total active_click_total \\\n", - "id \n", - "1 33.0 35.0 14.0 \n", - "2 27.0 89.0 19.0 \n", - "3 57.0 236.0 37.0 \n", - "4 20.0 37.0 14.0 \n", - "5 23.0 20.0 13.0 \n", - "\n", - " passive_click_total orders_amt_total imp_total click_total age \\\n", - "id \n", - "1 3.0 0 68.0 17.0 58.0 \n", - "2 4.0 3 116.0 23.0 54.0 \n", - "3 0.0 2 293.0 37.0 70.0 \n", - "4 1.0 0 57.0 15.0 43.0 \n", - "5 3.0 1 43.0 16.0 46.0 \n", - "\n", - " gender_cd device_platform_cd passive_share ctr_all has_order \n", - "id \n", - "1 M Android 0.514706 0.250000 0 \n", - "2 M Android 0.767241 0.198276 1 \n", - "3 F Android 0.805461 0.126280 1 \n", - "4 F Android 0.649123 0.263158 0 \n", - "5 M Android 0.465116 0.372093 1 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
active_imp_totalpassive_imp_totalactive_click_totalpassive_click_totalorders_amt_totalimp_totalclick_totalagegender_cddevice_platform_cdpassive_sharectr_allhas_order
id
133.035.014.03.0068.017.058.0MAndroid0.5147060.2500000
227.089.019.04.03116.023.054.0MAndroid0.7672410.1982761
357.0236.037.00.02293.037.070.0FAndroid0.8054610.1262801
420.037.014.01.0057.015.043.0FAndroid0.6491230.2631580
523.020.013.03.0143.016.046.0MAndroid0.4651160.3720931
\n", - "
" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 2 - }, - { - "cell_type": "markdown", - "id": "98ac09e6", - "metadata": {}, - "source": [ - "## Визуализация: заказы vs доля пассивных показов" - ] - }, - { - "cell_type": "code", - "id": "35bfe71d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:52.678022Z", - "iopub.status.busy": "2025-12-12T19:11:52.677564Z", - "iopub.status.idle": "2025-12-12T19:11:52.998699Z", - "shell.execute_reply": "2025-12-12T19:11:52.997056Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:46.985756Z", - "start_time": "2025-12-12T19:27:46.877380Z" - } - }, - "source": [ - "bins = pd.qcut(client[\"passive_share\"], 8, duplicates=\"drop\")\n", - "order_rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n", - "order_rate[\"passive_share\"] = order_rate[\"passive_share\"].astype(str)\n", - "plt.figure(figsize=(12, 4))\n", - "sns.lineplot(data=order_rate, x=\"passive_share\", y=\"has_order\", marker=\"o\")\n", - "plt.xticks(rotation=40)\n", - "plt.title(\"Доля клиентов с заказом vs доля пассивных показов\")\n", - "plt.tight_layout()\n", - "plt.show()\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/mx/y1qcnthj1154ngqj00r8gz480000gn/T/ipykernel_85284/3960648772.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " order_rate = client.groupby(bins)[\"has_order\"].mean().reset_index()\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ], - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - } - ], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "id": "6def67b9", - "metadata": {}, - "source": [ - "## ML-модель: влияние доли пассивных показов на заказ\n", - "Target: `has_order`. Фичи: объёмы актив/пассив, клики, возраст, пол, платформа, пассивная доля." - ] - }, - { - "cell_type": "code", - "id": "ae61b923", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:53.004801Z", - "iopub.status.busy": "2025-12-12T19:11:53.004396Z", - "iopub.status.idle": "2025-12-12T19:11:53.143675Z", - "shell.execute_reply": "2025-12-12T19:11:53.141866Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:47.045615Z", - "start_time": "2025-12-12T19:27:47.013172Z" - } - }, - "source": [ - "X = client[[\n", - " \"active_imp_total\",\n", - " \"passive_imp_total\",\n", - " \"active_click_total\",\n", - " \"passive_click_total\",\n", - " \"passive_share\",\n", - " \"age\",\n", - " \"gender_cd\",\n", - " \"device_platform_cd\",\n", - "]]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "y = client[\"has_order\"]\n", - "\n", - "numeric_cols = [\"active_imp_total\", \"passive_imp_total\", \"active_click_total\", \"passive_click_total\", \"passive_share\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "\n", - "preprocess = ColumnTransformer(\n", - " [\n", - " (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - " ]\n", - ")\n", - "\n", - "model = Pipeline([(\"pre\", preprocess), (\"clf\", LogisticRegression(max_iter=1000))])\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "model.fit(X_train, y_train)\n", - "proba 
= model.predict_proba(X_test)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "coef = model.named_steps[\"clf\"].coef_[0]\n", - "features = model.named_steps[\"pre\"].get_feature_names_out()\n", - "coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n", - "auc, coef_series.head(10)\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "(0.6804173758429694,\n", - " num__passive_click_total 0.638861\n", - " num__passive_share 0.303223\n", - " num__active_imp_total 0.216964\n", - " cat__device_platform_cd_Android 0.186635\n", - " num__active_click_total -0.150704\n", - " cat__gender_cd_M 0.130234\n", - " cat__device_platform_cd_iPadOS -0.105558\n", - " num__passive_imp_total -0.087140\n", - " num__age -0.072639\n", - " cat__device_platform_cd_iOS 0.038500\n", - " dtype: float64)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "id": "7df5ccb7", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n", - "- Линейный рост доли клиентов с заказом при увеличении `passive_share`.\n", - "- В модели коэффициент при `passive_share` положительный и по модулю в топ‑фичах; AUC ~0.68. Гипотеза подтверждается: высокая доля пассивных показов ассоциирована с большей вероятностью заказа при контроле объёма и кликов." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/alternative/passive_share_orders/eda_utils.py b/alternative/passive_share_orders/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/passive_share_orders/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS - + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - 
return cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = 
DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = 
imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/alternative/saturation_effect/analysis.ipynb b/alternative/saturation_effect/analysis.ipynb deleted file mode 100644 index 0225fd7..0000000 --- a/alternative/saturation_effect/analysis.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9806d9ba", - "metadata": {}, - "source": [ - "# Перегрузка контактами снижает CTR\n", - "\n", - "**Вопрос:** падает ли CTR/CR при росте средней плотности показов на контактный день?\n", - "\n", - "**Гипотеза:** высокая плотность показов (спам) уменьшает CTR и вероятность заказа. Проверяем через ML-классификацию высокого CTR." 
- ] - }, - { - "cell_type": "code", - "id": "0891ca2a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:23.062332Z", - "iopub.status.busy": "2025-12-12T19:11:23.062008Z", - "iopub.status.idle": "2025-12-12T19:11:29.703049Z", - "shell.execute_reply": "2025-12-12T19:11:29.700852Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:48.305598Z", - "start_time": "2025-12-12T19:27:47.155254Z" - } - }, - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "id": "9f0e5ca7", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:29.710292Z", - "iopub.status.busy": "2025-12-12T19:11:29.709769Z", - "iopub.status.idle": "2025-12-12T19:11:32.169479Z", - "shell.execute_reply": "2025-12-12T19:11:32.167853Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:48.938590Z", - "start_time": 
"2025-12-12T19:27:48.314667Z" - } - }, - "source": [ - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"imp_total\": \"sum\",\n", - " \"click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"business_dt\": \"nunique\",\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ").rename(columns={\"business_dt\": \"contact_days\"})\n", - "\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n", - "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n", - "client.head()\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " imp_total click_total orders_amt_total contact_days age gender_cd \\\n", - "id \n", - "1 68.0 17.0 0 13 58.0 M \n", - "2 116.0 23.0 3 15 54.0 M \n", - "3 293.0 37.0 2 31 70.0 F \n", - "4 57.0 15.0 0 12 43.0 F \n", - "5 43.0 16.0 1 10 46.0 M \n", - "\n", - " device_platform_cd ctr_all cr_click2order avg_imp_per_day \n", - "id \n", - "1 Android 0.250000 0.000000 5.230769 \n", - "2 Android 0.198276 0.130435 7.733333 \n", - "3 Android 0.126280 0.054054 9.451613 \n", - "4 Android 0.263158 0.000000 4.750000 \n", - "5 Android 0.372093 0.062500 4.300000 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
imp_totalclick_totalorders_amt_totalcontact_daysagegender_cddevice_platform_cdctr_allcr_click2orderavg_imp_per_day
id
168.017.001358.0MAndroid0.2500000.0000005.230769
2116.023.031554.0MAndroid0.1982760.1304357.733333
3293.037.023170.0FAndroid0.1262800.0540549.451613
457.015.001243.0FAndroid0.2631580.0000004.750000
543.016.011046.0MAndroid0.3720930.0625004.300000
\n", - "
" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 2 - }, - { - "cell_type": "markdown", - "id": "da15b5bc", - "metadata": {}, - "source": [ - "## Визуализация зависимости CTR от плотности показов" - ] - }, - { - "cell_type": "code", - "id": "3541e285", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:32.175488Z", - "iopub.status.busy": "2025-12-12T19:11:32.175156Z", - "iopub.status.idle": "2025-12-12T19:11:32.526850Z", - "shell.execute_reply": "2025-12-12T19:11:32.525156Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:49.183790Z", - "start_time": "2025-12-12T19:27:49.074446Z" - } - }, - "source": [ - "bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n", - "binned = client.groupby(bins)[\"ctr_all\"].median().reset_index()\n", - "binned[\"avg_imp_per_day\"] = binned[\"avg_imp_per_day\"].astype(str)\n", - "plt.figure(figsize=(12, 4))\n", - "sns.lineplot(data=binned, x=\"avg_imp_per_day\", y=\"ctr_all\", marker=\"o\")\n", - "plt.xticks(rotation=40)\n", - "plt.title(\"Медианный CTR vs плотность показов\")\n", - "plt.tight_layout()\n", - "plt.show()\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/mx/y1qcnthj1154ngqj00r8gz480000gn/T/ipykernel_85425/2642699463.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " binned = client.groupby(bins)[\"ctr_all\"].median().reset_index()\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ], - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - } - ], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "id": "daf7ccc6", - "metadata": {}, - "source": [ - "## ML-модель: предсказание высокого CTR\n", - "Target: верхний квартиль CTR. Фича: плотность показов + контрольные по возрасту/платформе и объёму." - ] - }, - { - "cell_type": "code", - "id": "6eeb3f56", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-12T19:11:32.533171Z", - "iopub.status.busy": "2025-12-12T19:11:32.532766Z", - "iopub.status.idle": "2025-12-12T19:11:32.689952Z", - "shell.execute_reply": "2025-12-12T19:11:32.688488Z" - }, - "ExecuteTime": { - "end_time": "2025-12-12T19:27:49.254084Z", - "start_time": "2025-12-12T19:27:49.213434Z" - } - }, - "source": [ - "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n", - "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n", - "y = client[\"high_ctr\"]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "\n", - "numeric_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "\n", - "preprocess = ColumnTransformer(\n", - " [\n", - " (\"num\", Pipeline([(\"scaler\", StandardScaler())]), numeric_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - " ]\n", - ")\n", - "\n", - "model = Pipeline([(\"pre\", preprocess), (\"clf\", LogisticRegression(max_iter=1000))])\n", - "model.fit(X_train, 
y_train)\n", - "proba = model.predict_proba(X_test)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "coef = model.named_steps[\"clf\"].coef_[0]\n", - "features = model.named_steps[\"pre\"].get_feature_names_out()\n", - "coef_series = pd.Series(coef, index=features).sort_values(key=abs, ascending=False)\n", - "auc, coef_series.head(10)\n" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "(0.9995987243255224,\n", - " num__imp_total -17.459250\n", - " num__click_total 9.930772\n", - " num__avg_imp_per_day -0.977583\n", - " cat__device_platform_cd_iPadOS -0.189993\n", - " cat__device_platform_cd_Android 0.130996\n", - " num__age 0.060885\n", - " cat__device_platform_cd_iOS 0.039199\n", - " cat__gender_cd_M -0.026146\n", - " cat__gender_cd_F 0.006348\n", - " dtype: float64)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "id": "071e5ad9", - "metadata": {}, - "source": [ - "## Вывод по гипотезе\n", - "- Сильное убывание CTR при росте плотности показов (график выше).\n", - "- В модели коэффициент при `avg_imp_per_day` отрицательный (-0.98): высокая плотность снижает шанс попасть в верхний квартиль CTR. Наибольшие по модулю коэффициенты — у `imp_total` (-17.46) и `click_total` (9.93), а AUC ~1.00 объясняется тем, что эти два признака напрямую задают CTR (утечка таргета), а не влиянием плотности.\n", - "- Гипотеза подтверждается графиком и знаком коэффициента: спамная частота контактов убивает вовлечённость. Для строгой проверки модель стоит переобучить без `imp_total` и `click_total` среди признаков." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/alternative/saturation_effect/eda_utils.py b/alternative/saturation_effect/eda_utils.py deleted file mode 100644 index 802a6d8..0000000 --- a/alternative/saturation_effect/eda_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Dict, Iterable, List - -import numpy as np -import pandas as pd - -# Paths and column groups -DATA_PATH = Path("dataset/ds.csv") -CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"] - -ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES] -PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES] -ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES] -PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES] -ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES] - -NUMERIC_COLS = ( - ACTIVE_IMP_COLS - + PASSIVE_IMP_COLS - + ACTIVE_CLICK_COLS - + PASSIVE_CLICK_COLS - + ORDER_COLS - + ["age"] -) -CAT_COLS = ["gender_cd", "device_platform_cd"] - - -def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series: - """Divide with protection against zero (works for Series and scalars).""" - if isinstance(denominator, pd.Series): - denom = denominator.replace(0, np.nan) - else: - denom = np.nan if float(denominator) == 0 else denominator - return numerator / denom - - -def normalize_gender(series: pd.Series) -> pd.Series: - cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper() - mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"} - return 
cleaned.map(mapping).fillna("UNKNOWN") - - -def normalize_device(series: pd.Series) -> pd.Series: - cleaned = series.fillna("unknown").astype(str).str.strip() - lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "") - mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"} - mapped = lowered.map(mapping) - fallback = cleaned.str.title() - return mapped.fillna(fallback) - - -def add_age_group(df: pd.DataFrame) -> pd.DataFrame: - bins = [0, 25, 35, 45, 55, np.inf] - labels = ["<25", "25-34", "35-44", "45-54", "55+"] - df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False) - return df - - -def add_totals(df: pd.DataFrame) -> pd.DataFrame: - df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1) - df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1) - df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1) - df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1) - df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1) - df["click_total"] = df["active_click_total"] + df["passive_click_total"] - df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] - df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"]) - df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"]) - df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"]) - df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"]) - df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"]) - return df - - -def add_flags(df: pd.DataFrame) -> pd.DataFrame: - df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int) - df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int) - df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1) - return df - - -def load_data(path: Path | str = 
DATA_PATH) -> pd.DataFrame: - df = pd.read_csv(path) - df["business_dt"] = pd.to_datetime(df["business_dt"]) - df["gender_cd"] = normalize_gender(df["gender_cd"]) - df["device_platform_cd"] = normalize_device(df["device_platform_cd"]) - df = add_age_group(df) - df = add_totals(df) - df = add_flags(df) - return df - - -def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame: - stats = [] - for col in cols: - series = df[col] - stats.append( - { - "col": col, - "count": series.count(), - "mean": series.mean(), - "median": series.median(), - "std": series.std(), - "min": series.min(), - "q25": series.quantile(0.25), - "q75": series.quantile(0.75), - "max": series.max(), - "share_zero": (series == 0).mean(), - "p95": series.quantile(0.95), - "p99": series.quantile(0.99), - } - ) - return pd.DataFrame(stats) - - -def build_daily(df: pd.DataFrame) -> pd.DataFrame: - agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS - daily = df.groupby("business_dt")[agg_cols].sum().reset_index() - daily = add_totals(daily) - daily["day_of_week"] = daily["business_dt"].dt.day_name() - return daily - - -def build_client(df: pd.DataFrame) -> pd.DataFrame: - agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS} - meta_spec: Dict[str, str | callable] = { - "age": "median", - "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN", - "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan, - "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other", - } - agg_spec.update(meta_spec) - client = df.groupby("id").agg(agg_spec).reset_index() - contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") - imp_day = df.copy() - imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1) - max_imp_day = 
imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day") - client = add_totals(client) - client = add_flags(client) - client = client.merge(contact_days, on="id", how="left") - client = client.merge(max_imp_day, on="id", how="left") - client = add_contact_density(client) - return client - - -def add_contact_density(df: pd.DataFrame) -> pd.DataFrame: - # contact_days must already be present - if "contact_days" in df.columns: - df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"]) - return df - return df diff --git a/spam_hypot/01_stat_analysis.ipynb b/spam_hypot/01_stat_analysis.ipynb deleted file mode 100644 index 4ce6983..0000000 --- a/spam_hypot/01_stat_analysis.ipynb +++ /dev/null @@ -1,188 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "4d7d3347", - "metadata": {}, - "source": [ - "# Спам-гипотеза: плотность показов vs CTR/CR\n", - "\n", - "Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7acbd1c8", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from scipy import stats\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n", - "\n", - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"imp_total\": 
\"sum\",\n", - " \"click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n", - "\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n", - "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n", - "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n" - ] - }, - { - "cell_type": "markdown", - "id": "94eb2d26", - "metadata": {}, - "source": [ - "## Базовые статистики" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "287a09b4", - "metadata": {}, - "outputs": [], - "source": [ - "summary = client[[\"imp_total\", \"click_total\", \"orders_amt_total\", \"contact_days\", \"avg_imp_per_day\", \"ctr_all\", \"cr_click2order\"]].describe().T\n", - "missing = client.isna().mean().sort_values(ascending=False)\n", - "summary, missing.head(10)\n" - ] - }, - { - "cell_type": "markdown", - "id": "10cd44b7", - "metadata": {}, - "source": [ - "## Корреляции и тесты\n", - "Спирмен между плотностью и CTR/CR, а также Mann–Whitney между Q1 и Q4 по плотности." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88714a03", - "metadata": {}, - "outputs": [], - "source": [ - "corr_ctr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"ctr_all\"])\n", - "corr_cr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"cr_click2order\"])\n", - "q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n", - "q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n", - "low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n", - "high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n", - "wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n", - "{ \"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu }\n" - ] - }, - { - "cell_type": "markdown", - "id": "20d492fa", - "metadata": {}, - "source": [ - "```python\n", - "bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n", - "stats_bin = client.groupby(bins, observed=False).agg(\n", - " ctr_all=(\"ctr_all\", \"median\"),\n", - " cr_click2order=(\"cr_click2order\", \"median\"),\n", - " avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n", - ").reset_index()\n", - "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n", - "fig, ax1 = plt.subplots(figsize=(12, 5))\n", - "ax2 = ax1.twinx()\n", - "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n", - "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n", - "ax1.set_ylabel(\"CTR\")\n", - "ax2.set_ylabel(\"CR click→order\")\n", - "plt.xticks(rotation=35)\n", - "ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n", - "fig.tight_layout()\n", - "plt.show()\n", - "stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "943f0d4b", - "metadata": {}, - "outputs": [], - "source": [ - "bins = 
pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n", - "stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n", - "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n", - "fig, ax1 = plt.subplots(figsize=(12, 5))\n", - "ax2 = ax1.twinx()\n", - "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n", - "sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n", - "ax1.set_ylabel(\"CTR\")\n", - "ax2.set_ylabel(\"CR click→order\")\n", - "plt.xticks(rotation=35)\n", - "ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n", - "fig.tight_layout()\n", - "plt.show()\n", - "stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/spam_hypot/02_models.ipynb b/spam_hypot/02_models.ipynb deleted file mode 100644 index aa7e4f2..0000000 --- a/spam_hypot/02_models.ipynb +++ /dev/null @@ -1,161 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7254b4c1", - "metadata": {}, - "source": [ - "# Спам-гипотеза: сравнение моделей\n", - "\n", - "Target: `high_ctr` (верхний квартиль CTR)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7f54168", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from scipy import stats\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n", - "\n", - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"imp_total\": 
\"sum\",\n", - " \"click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n", - "\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n", - "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n", - "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n" - ] - }, - { - "cell_type": "markdown", - "id": "21786c63", - "metadata": {}, - "source": [ - "## Модели: Logistic Regression vs GradientBoosting" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc8dbc94", - "metadata": {}, - "outputs": [], - "source": [ - "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "y = client[\"high_ctr\"]\n", - "\n", - "num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "pre = ColumnTransformer([\n", - " (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - "])\n", - "\n", - "log_reg = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n", - "gb = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n", - "\n", - "X_train, X_test, 
y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "res = {}\n", - "for name, model in [(\"log_reg\", log_reg), (\"gb\", gb)]:\n", - " model.fit(X_train, y_train)\n", - " proba = model.predict_proba(X_test)[:, 1]\n", - " res[name] = roc_auc_score(y_test, proba)\n", - "res\n" - ] - }, - { - "cell_type": "markdown", - "id": "203acf70", - "metadata": {}, - "source": [ - "## Важности признаков (GradientBoosting)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3eac9e17", - "metadata": {}, - "outputs": [], - "source": [ - "gb_model = gb\n", - "feat_names = gb_model.named_steps[\"pre\"].get_feature_names_out()\n", - "importances = gb_model.named_steps[\"clf\"].feature_importances_\n", - "imp_df = pd.DataFrame({\"feature\": feat_names, \"importance\": importances}).sort_values(\"importance\", ascending=False)\n", - "plt.figure(figsize=(8, 5))\n", - "sns.barplot(data=imp_df.head(15), x=\"importance\", y=\"feature\", palette=\"viridis\")\n", - "plt.title(\"Top-15 feature importances (GB)\")\n", - "plt.tight_layout()\n", - "plt.show()\n", - "imp_df.head(15)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/spam_hypot/03_best_model.ipynb b/spam_hypot/03_best_model.ipynb deleted file mode 100644 index 25d9956..0000000 --- a/spam_hypot/03_best_model.ipynb +++ /dev/null @@ -1,206 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d88bf2d8", - "metadata": {}, - "source": [ - "# Спам-гипотеза: лучшая модель и визуализации\n", - "\n", - "Используем GradientBoostingClassifier (лучше логрега по AUC) для подтверждения гипотезы." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87f3f728", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlite3\n", - "from pathlib import Path\n", - "import sys\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from scipy import stats\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "sns.set_theme(style=\"whitegrid\")\n", - "plt.rcParams[\"figure.figsize\"] = (10, 5)\n", - "\n", - "project_root = Path.cwd().resolve()\n", - "while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n", - " project_root = project_root.parent\n", - "sys.path.append(str(project_root / \"preanalysis\"))\n", - "import eda_utils as eda\n", - "\n", - "db_path = project_root / \"dataset\" / \"ds.sqlite\"\n", - "conn = sqlite3.connect(db_path)\n", - "df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n", - "conn.close()\n", - "\n", - "for cols, name in [\n", - " (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n", - " (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n", - " (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n", - " (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n", - " (eda.ORDER_COLS, \"orders_amt_total\"),\n", - "]:\n", - " df[name] = df[cols].sum(axis=1)\n", - "\n", - "df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n", - "df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n", - "\n", - "contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n", - "client = df.groupby(\"id\").agg(\n", - " {\n", - " \"imp_total\": 
\"sum\",\n", - " \"click_total\": \"sum\",\n", - " \"orders_amt_total\": \"sum\",\n", - " \"age\": \"median\",\n", - " \"gender_cd\": lambda s: s.mode().iat[0],\n", - " \"device_platform_cd\": lambda s: s.mode().iat[0],\n", - " }\n", - ").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n", - "\n", - "client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n", - "client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n", - "client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n", - "client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n", - "client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n" - ] - }, - { - "cell_type": "markdown", - "id": "17da010c", - "metadata": {}, - "source": [ - "## Обучение лучшей модели" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81433d7e", - "metadata": {}, - "outputs": [], - "source": [ - "X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n", - "X = X.copy()\n", - "X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n", - "X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n", - "y = client[\"high_ctr\"]\n", - "\n", - "num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n", - "cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n", - "pre = ColumnTransformer([\n", - " (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n", - " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", - "])\n", - "\n", - "best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "best.fit(X_train, y_train)\n", 
- "proba = best.predict_proba(X_test)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "auc\n" - ] - }, - { - "cell_type": "markdown", - "id": "63f4db9b", - "metadata": {}, - "source": [ - "## Прогноз vs плотность показов" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f48584b5", - "metadata": {}, - "outputs": [], - "source": [ - "grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n", - "base = client.median(numeric_only=True)\n", - "base_gender = client[\"gender_cd\"].mode().iat[0]\n", - "base_device = client[\"device_platform_cd\"].mode().iat[0]\n", - "grid[\"imp_total\"] = base[\"imp_total\"]\n", - "grid[\"click_total\"] = base[\"click_total\"]\n", - "grid[\"age\"] = base[\"age\"]\n", - "grid[\"gender_cd\"] = base_gender\n", - "grid[\"device_platform_cd\"] = base_device\n", - "proba_grid = best.predict_proba(grid)[:, 1]\n", - "plt.figure(figsize=(10, 4))\n", - "plt.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n", - "plt.xlabel(\"avg_imp_per_day\")\n", - "plt.ylabel(\"P(high CTR)\")\n", - "plt.title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n", - "plt.tight_layout()\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "id": "32f73b44", - "metadata": {}, - "source": [ - "## График CTR и CR по тонким бинам (две оси)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb4d0190", - "metadata": {}, - "outputs": [], - "source": [ - "bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\")\n", - "stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n", - "stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n", - "fig, ax1 = plt.subplots(figsize=(12, 5))\n", - "ax2 = ax1.twinx()\n", - "ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", 
color=\"#4c72b0\", label=\"CTR\")\n", - "ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n", - "ax1.set_ylabel(\"CTR\")\n", - "ax2.set_ylabel(\"CR click→order\")\n", - "ax1.set_xlabel(\"avg_imp_per_day bins\")\n", - "plt.xticks(rotation=35)\n", - "ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n", - "fig.tight_layout()\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "id": "ebb2ca5e", - "metadata": {}, - "source": [ - "## Вывод\n", - "- AUC модели GradientBoosting > логрега; `avg_imp_per_day` ключевой драйвер: рост плотности снижает шанс попасть в верхний квартиль CTR.\n", - "- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n", - "- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/spam_hypot/best_bins.png b/spam_hypot/best_bins.png index cad4d81..6090301 100644 Binary files a/spam_hypot/best_bins.png and b/spam_hypot/best_bins.png differ diff --git a/spam_hypot/best_model_prob.png b/spam_hypot/best_model_prob.png index cccd2de..fb205ae 100644 Binary files a/spam_hypot/best_model_prob.png and b/spam_hypot/best_model_prob.png differ diff --git a/spam_hypot/model_compare.py b/spam_hypot/model_compare.py index 450d033..c45f307 100644 --- a/spam_hypot/model_compare.py +++ b/spam_hypot/model_compare.py @@ -46,32 +46,62 @@ client = ( .merge(contact_days, on="id", how="left") .reset_index() ) +# ... 
всё как у тебя до расчёта client["ctr_all"] включительно + client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"]) client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"]) -client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int) -X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]] -X = X.copy() -X["gender_cd"] = eda.normalize_gender(X["gender_cd"]) -X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"]) -y = client["high_ctr"] +# --- SPLIT СНАЧАЛА, ТАРГЕТ ПОТОМ --- +train_idx, test_idx = train_test_split( + client.index, test_size=0.2, random_state=42 +) -num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"] +train = client.loc[train_idx].copy() +test = client.loc[test_idx].copy() + +thr = train["ctr_all"].quantile(0.75) # порог только по train +train["high_ctr"] = (train["ctr_all"] >= thr).astype(int) +test["high_ctr"] = (test["ctr_all"] >= thr).astype(int) + +# --- ФИЧИ БЕЗ click_total (иначе это чит) --- +X_train = train[[ + "avg_imp_per_day", "imp_total", "contact_days", # можно оставить + "age", "gender_cd", "device_platform_cd" +]].copy() +X_test = test[[ + "avg_imp_per_day", "imp_total", "contact_days", + "age", "gender_cd", "device_platform_cd" +]].copy() + +X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"]) +X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"]) +X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"]) +X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"]) + +y_train = train["high_ctr"] +y_test = test["high_ctr"] + +num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"] cat_cols = ["gender_cd", "device_platform_cd"] + pre = ColumnTransformer([ - ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols), + ("num", Pipeline([ + ("imputer", 
SimpleImputer(strategy="median")), + ("scaler", StandardScaler()) + ]), num_cols), ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols), ]) log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))]) gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))]) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) results = {} for name, model in [("log_reg", log_reg), ("gb", gb)]: model.fit(X_train, y_train) proba = model.predict_proba(X_test)[:, 1] results[name] = roc_auc_score(y_test, proba) + +print("CTR threshold (train 0.75q):", thr) print("AUC results:", results) imp = gb.named_steps["clf"].feature_importances_ diff --git a/spam_hypot/stat_bins.png b/spam_hypot/stat_bins.png index 5d47f71..0510a10 100644 Binary files a/spam_hypot/stat_bins.png and b/spam_hypot/stat_bins.png differ