spam hypot
This commit is contained in:
File diff suppressed because one or more lines are too long
154
alternative/category_mix_uplift/eda_utils.py
Normal file
154
alternative/category_mix_uplift/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups shared by all EDA helpers in this module.
DATA_PATH = Path("dataset/ds.csv")

# Product categories; every per-category metric column carries one of these suffixes.
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]

ACTIVE_IMP_COLS = ["active_imp_" + c for c in CATEGORIES]
PASSIVE_IMP_COLS = ["passive_imp_" + c for c in CATEGORIES]
ACTIVE_CLICK_COLS = ["active_click_" + c for c in CATEGORIES]
PASSIVE_CLICK_COLS = ["passive_click_" + c for c in CATEGORIES]
ORDER_COLS = ["orders_amt_" + c for c in CATEGORIES]

# Every numeric feature column: all metric groups plus the client's age.
NUMERIC_COLS = [
    col
    for group in (
        ACTIVE_IMP_COLS,
        PASSIVE_IMP_COLS,
        ACTIVE_CLICK_COLS,
        PASSIVE_CLICK_COLS,
        ORDER_COLS,
    )
    for col in group
] + ["age"]

# Categorical feature columns.
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
    """Divide ``numerator`` by ``denominator``, turning zero denominators into NaN.

    Works for both pandas Series and plain scalars: a scalar zero denominator
    (or any zero entry of a Series denominator) yields NaN instead of raising
    or producing infinities.
    """
    if isinstance(denominator, pd.Series):
        guarded = denominator.replace(0, np.nan)
        return numerator / guarded
    guarded = denominator if float(denominator) != 0 else np.nan
    return numerator / guarded
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
    """Collapse raw gender codes into the canonical set {"M", "F", "UNKNOWN"}."""
    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    as_text = series.fillna("UNKNOWN").astype(str)
    upper = as_text.str.strip().str.upper()
    # Anything outside the known spellings falls back to UNKNOWN.
    return upper.map(canonical).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
    """Canonicalize device-platform labels (Android / iOS / iPadOS).

    Unrecognized platforms keep their original (trimmed) text, title-cased.
    """
    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    trimmed = series.fillna("unknown").astype(str).str.strip()
    # Compare in a whitespace/underscore-free lowercase form.
    comparable = trimmed.str.lower().str.replace(" ", "").str.replace("_", "")
    recognized = comparable.map(known)
    return recognized.fillna(trimmed.str.title())
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``age_group`` categorical column to *df* in place and return it.

    Buckets are left-closed/right-open: [0,25), [25,35), [35,45), [45,55), [55,inf).
    """
    df["age_group"] = pd.cut(
        df["age"],
        bins=[0, 25, 35, 45, 55, np.inf],
        labels=["<25", "25-34", "35-44", "45-54", "55+"],
        right=False,
    )
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Attach per-row totals and derived funnel rates (CTR / CR) in place.

    Sums each metric group across categories, then combines them into overall
    click/impression totals and zero-safe conversion ratios.
    """
    group_totals = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_name, cols in group_totals.items():
        df[total_name] = df[cols].sum(axis=1)

    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    # Each rate is numerator/denominator with zero denominators mapped to NaN.
    rates = {
        "active_ctr": ("active_click_total", "active_imp_total"),
        "passive_ctr": ("passive_click_total", "passive_imp_total"),
        "ctr_all": ("click_total", "imp_total"),
        "cr_click2order": ("orders_amt_total", "click_total"),
        "cr_imp2order": ("orders_amt_total", "imp_total"),
    }
    for rate_name, (num, den) in rates.items():
        df[rate_name] = safe_divide(df[num], df[den])
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Attach 0/1 communication/order flags and an order-category count in place."""
    active_cols = ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS
    passive_cols = PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS
    df["has_active_comm"] = df[active_cols].sum(axis=1).gt(0).astype(int)
    df["has_passive_comm"] = df[passive_cols].sum(axis=1).gt(0).astype(int)
    df["has_any_order"] = df[ORDER_COLS].sum(axis=1).gt(0).astype(int)
    # Number of distinct categories the client ordered in.
    df["order_categories_count"] = df[ORDER_COLS].gt(0).sum(axis=1)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the raw CSV and run the full cleaning/feature pipeline.

    Parses ``business_dt`` to datetime, normalizes the categorical codes, then
    derives age groups, totals/rates, and flag columns via this module's helpers.
    """
    frame = pd.read_csv(path)
    frame["business_dt"] = pd.to_datetime(frame["business_dt"])
    frame["gender_cd"] = normalize_gender(frame["gender_cd"])
    frame["device_platform_cd"] = normalize_device(frame["device_platform_cd"])
    for enrich in (add_age_group, add_totals, add_flags):
        frame = enrich(frame)
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Return one summary row per column: moments, quantiles, and zero share."""

    def _summarize(name: str) -> dict:
        # Standard descriptive stats plus the fraction of exact zeros and tail quantiles.
        s = df[name]
        return {
            "col": name,
            "count": s.count(),
            "mean": s.mean(),
            "median": s.median(),
            "std": s.std(),
            "min": s.min(),
            "q25": s.quantile(0.25),
            "q75": s.quantile(0.75),
            "max": s.max(),
            "share_zero": (s == 0).mean(),
            "p95": s.quantile(0.95),
            "p99": s.quantile(0.99),
        }

    return pd.DataFrame([_summarize(c) for c in cols])
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the row-level data into one row per business date.

    Sums every metric column per day, re-derives totals/rates, and tags each
    day with its weekday name.
    """
    measure_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    daily = add_totals(df.groupby("business_dt")[measure_cols].sum().reset_index())
    daily["day_of_week"] = daily["business_dt"].dt.day_name()
    return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the row-level data into one row per client ``id``.

    Metric columns are summed, age is the median, and categorical attributes
    take the most frequent value (with per-column fallbacks). Contact-day
    counts, the per-day impression peak, and contact density are attached.
    """

    def _most_frequent(fallback):
        # Pick the first mode of a group, or *fallback* when the group has no mode.
        def pick(s: pd.Series):
            modes = s.mode()
            return fallback if modes.empty else modes.iat[0]

        return pick

    sum_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    agg_spec: Dict[str, str | callable] = dict.fromkeys(sum_cols, "sum")
    agg_spec["age"] = "median"
    agg_spec["gender_cd"] = _most_frequent("UNKNOWN")
    agg_spec["age_group"] = _most_frequent(np.nan)
    agg_spec["device_platform_cd"] = _most_frequent("Other")

    client = df.groupby("id").agg(agg_spec).reset_index()

    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

    per_day = df.copy()
    per_day["imp_day_total"] = per_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = (
        per_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
    )

    client = add_flags(add_totals(client))
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    return add_contact_density(client)
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Attach average impressions per contact day; no-op when ``contact_days`` is absent."""
    if "contact_days" not in df.columns:
        return df
    # Zero contact days becomes NaN so the division never produces inf.
    df["avg_impressions_per_contact_day"] = df["imp_total"] / df["contact_days"].replace(0, np.nan)
    return df
|
||||||
File diff suppressed because one or more lines are too long
154
alternative/contact_frequency_orders/eda_utils.py
Normal file
154
alternative/contact_frequency_orders/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups
|
||||||
|
DATA_PATH = Path("dataset/ds.csv")
|
||||||
|
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
|
|
||||||
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
NUMERIC_COLS = (
|
||||||
|
ACTIVE_IMP_COLS
|
||||||
|
+ PASSIVE_IMP_COLS
|
||||||
|
+ ACTIVE_CLICK_COLS
|
||||||
|
+ PASSIVE_CLICK_COLS
|
||||||
|
+ ORDER_COLS
|
||||||
|
+ ["age"]
|
||||||
|
)
|
||||||
|
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
||||||
|
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
||||||
|
return cleaned.map(mapping).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("unknown").astype(str).str.strip()
|
||||||
|
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
||||||
|
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
||||||
|
mapped = lowered.map(mapping)
|
||||||
|
fallback = cleaned.str.title()
|
||||||
|
return mapped.fillna(fallback)
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
bins = [0, 25, 35, 45, 55, np.inf]
|
||||||
|
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
||||||
|
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
||||||
|
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
||||||
|
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
||||||
|
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
||||||
|
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
||||||
|
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
||||||
|
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
||||||
|
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
||||||
|
df = pd.read_csv(path)
|
||||||
|
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
||||||
|
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
||||||
|
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
||||||
|
df = add_age_group(df)
|
||||||
|
df = add_totals(df)
|
||||||
|
df = add_flags(df)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
||||||
|
stats = []
|
||||||
|
for col in cols:
|
||||||
|
series = df[col]
|
||||||
|
stats.append(
|
||||||
|
{
|
||||||
|
"col": col,
|
||||||
|
"count": series.count(),
|
||||||
|
"mean": series.mean(),
|
||||||
|
"median": series.median(),
|
||||||
|
"std": series.std(),
|
||||||
|
"min": series.min(),
|
||||||
|
"q25": series.quantile(0.25),
|
||||||
|
"q75": series.quantile(0.75),
|
||||||
|
"max": series.max(),
|
||||||
|
"share_zero": (series == 0).mean(),
|
||||||
|
"p95": series.quantile(0.95),
|
||||||
|
"p99": series.quantile(0.99),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return pd.DataFrame(stats)
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
||||||
|
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
||||||
|
daily = add_totals(daily)
|
||||||
|
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
||||||
|
return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
||||||
|
meta_spec: Dict[str, str | callable] = {
|
||||||
|
"age": "median",
|
||||||
|
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
||||||
|
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
||||||
|
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
||||||
|
}
|
||||||
|
agg_spec.update(meta_spec)
|
||||||
|
client = df.groupby("id").agg(agg_spec).reset_index()
|
||||||
|
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
||||||
|
imp_day = df.copy()
|
||||||
|
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
||||||
|
client = add_totals(client)
|
||||||
|
client = add_flags(client)
|
||||||
|
client = client.merge(contact_days, on="id", how="left")
|
||||||
|
client = client.merge(max_imp_day, on="id", how="left")
|
||||||
|
client = add_contact_density(client)
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# contact_days must already be present
|
||||||
|
if "contact_days" in df.columns:
|
||||||
|
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
||||||
|
return df
|
||||||
|
return df
|
||||||
File diff suppressed because one or more lines are too long
154
alternative/device_orders/eda_utils.py
Normal file
154
alternative/device_orders/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups
|
||||||
|
DATA_PATH = Path("dataset/ds.csv")
|
||||||
|
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
|
|
||||||
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
NUMERIC_COLS = (
|
||||||
|
ACTIVE_IMP_COLS
|
||||||
|
+ PASSIVE_IMP_COLS
|
||||||
|
+ ACTIVE_CLICK_COLS
|
||||||
|
+ PASSIVE_CLICK_COLS
|
||||||
|
+ ORDER_COLS
|
||||||
|
+ ["age"]
|
||||||
|
)
|
||||||
|
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
||||||
|
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
||||||
|
return cleaned.map(mapping).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("unknown").astype(str).str.strip()
|
||||||
|
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
||||||
|
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
||||||
|
mapped = lowered.map(mapping)
|
||||||
|
fallback = cleaned.str.title()
|
||||||
|
return mapped.fillna(fallback)
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
bins = [0, 25, 35, 45, 55, np.inf]
|
||||||
|
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
||||||
|
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
||||||
|
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
||||||
|
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
||||||
|
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
||||||
|
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
||||||
|
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
||||||
|
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
||||||
|
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
||||||
|
df = pd.read_csv(path)
|
||||||
|
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
||||||
|
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
||||||
|
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
||||||
|
df = add_age_group(df)
|
||||||
|
df = add_totals(df)
|
||||||
|
df = add_flags(df)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
||||||
|
stats = []
|
||||||
|
for col in cols:
|
||||||
|
series = df[col]
|
||||||
|
stats.append(
|
||||||
|
{
|
||||||
|
"col": col,
|
||||||
|
"count": series.count(),
|
||||||
|
"mean": series.mean(),
|
||||||
|
"median": series.median(),
|
||||||
|
"std": series.std(),
|
||||||
|
"min": series.min(),
|
||||||
|
"q25": series.quantile(0.25),
|
||||||
|
"q75": series.quantile(0.75),
|
||||||
|
"max": series.max(),
|
||||||
|
"share_zero": (series == 0).mean(),
|
||||||
|
"p95": series.quantile(0.95),
|
||||||
|
"p99": series.quantile(0.99),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return pd.DataFrame(stats)
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
||||||
|
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
||||||
|
daily = add_totals(daily)
|
||||||
|
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
||||||
|
return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
||||||
|
meta_spec: Dict[str, str | callable] = {
|
||||||
|
"age": "median",
|
||||||
|
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
||||||
|
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
||||||
|
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
||||||
|
}
|
||||||
|
agg_spec.update(meta_spec)
|
||||||
|
client = df.groupby("id").agg(agg_spec).reset_index()
|
||||||
|
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
||||||
|
imp_day = df.copy()
|
||||||
|
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
||||||
|
client = add_totals(client)
|
||||||
|
client = add_flags(client)
|
||||||
|
client = client.merge(contact_days, on="id", how="left")
|
||||||
|
client = client.merge(max_imp_day, on="id", how="left")
|
||||||
|
client = add_contact_density(client)
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# contact_days must already be present
|
||||||
|
if "contact_days" in df.columns:
|
||||||
|
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
||||||
|
return df
|
||||||
|
return df
|
||||||
File diff suppressed because one or more lines are too long
154
alternative/ent_passive_ctr_uplift/eda_utils.py
Normal file
154
alternative/ent_passive_ctr_uplift/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups
|
||||||
|
DATA_PATH = Path("dataset/ds.csv")
|
||||||
|
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
|
|
||||||
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
NUMERIC_COLS = (
|
||||||
|
ACTIVE_IMP_COLS
|
||||||
|
+ PASSIVE_IMP_COLS
|
||||||
|
+ ACTIVE_CLICK_COLS
|
||||||
|
+ PASSIVE_CLICK_COLS
|
||||||
|
+ ORDER_COLS
|
||||||
|
+ ["age"]
|
||||||
|
)
|
||||||
|
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
||||||
|
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
||||||
|
return cleaned.map(mapping).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("unknown").astype(str).str.strip()
|
||||||
|
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
||||||
|
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
||||||
|
mapped = lowered.map(mapping)
|
||||||
|
fallback = cleaned.str.title()
|
||||||
|
return mapped.fillna(fallback)
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
bins = [0, 25, 35, 45, 55, np.inf]
|
||||||
|
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
||||||
|
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
||||||
|
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
||||||
|
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
||||||
|
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
||||||
|
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
||||||
|
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
||||||
|
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
||||||
|
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
||||||
|
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
||||||
|
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
||||||
|
df = pd.read_csv(path)
|
||||||
|
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
||||||
|
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
||||||
|
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
||||||
|
df = add_age_group(df)
|
||||||
|
df = add_totals(df)
|
||||||
|
df = add_flags(df)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
||||||
|
stats = []
|
||||||
|
for col in cols:
|
||||||
|
series = df[col]
|
||||||
|
stats.append(
|
||||||
|
{
|
||||||
|
"col": col,
|
||||||
|
"count": series.count(),
|
||||||
|
"mean": series.mean(),
|
||||||
|
"median": series.median(),
|
||||||
|
"std": series.std(),
|
||||||
|
"min": series.min(),
|
||||||
|
"q25": series.quantile(0.25),
|
||||||
|
"q75": series.quantile(0.75),
|
||||||
|
"max": series.max(),
|
||||||
|
"share_zero": (series == 0).mean(),
|
||||||
|
"p95": series.quantile(0.95),
|
||||||
|
"p99": series.quantile(0.99),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return pd.DataFrame(stats)
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
||||||
|
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
||||||
|
daily = add_totals(daily)
|
||||||
|
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
||||||
|
return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
||||||
|
meta_spec: Dict[str, str | callable] = {
|
||||||
|
"age": "median",
|
||||||
|
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
||||||
|
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
||||||
|
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
||||||
|
}
|
||||||
|
agg_spec.update(meta_spec)
|
||||||
|
client = df.groupby("id").agg(agg_spec).reset_index()
|
||||||
|
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
||||||
|
imp_day = df.copy()
|
||||||
|
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
|
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
||||||
|
client = add_totals(client)
|
||||||
|
client = add_flags(client)
|
||||||
|
client = client.merge(contact_days, on="id", how="left")
|
||||||
|
client = client.merge(max_imp_day, on="id", how="left")
|
||||||
|
client = add_contact_density(client)
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# contact_days must already be present
|
||||||
|
if "contact_days" in df.columns:
|
||||||
|
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
||||||
|
return df
|
||||||
|
return df
|
||||||
File diff suppressed because one or more lines are too long
154
alternative/passive_share_orders/eda_utils.py
Normal file
154
alternative/passive_share_orders/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups
|
||||||
|
DATA_PATH = Path("dataset/ds.csv")
|
||||||
|
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
|
|
||||||
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
NUMERIC_COLS = (
|
||||||
|
ACTIVE_IMP_COLS
|
||||||
|
+ PASSIVE_IMP_COLS
|
||||||
|
+ ACTIVE_CLICK_COLS
|
||||||
|
+ PASSIVE_CLICK_COLS
|
||||||
|
+ ORDER_COLS
|
||||||
|
+ ["age"]
|
||||||
|
)
|
||||||
|
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
|
||||||
|
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
||||||
|
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
||||||
|
return cleaned.map(mapping).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
    """Normalize device-platform labels to canonical names (Android/iOS/iPadOS)."""
    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    # Build a whitespace/underscore-free lowercase key for matching.
    key = stripped.str.lower().str.replace(" ", "").str.replace("_", "")
    # Unrecognized platforms fall back to Title Case of the stripped label.
    return key.map(known).fillna(stripped.str.title())
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``age_group`` categorical derived from ``age``.

    Bins are left-closed (``right=False``), so e.g. age 25 lands in "25-34".
    Mutates ``df`` in place and also returns it.
    """
    df["age_group"] = pd.cut(
        df["age"],
        bins=[0, 25, 35, 45, 55, np.inf],
        labels=["<25", "25-34", "35-44", "45-54", "55+"],
        right=False,
    )
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row totals plus derived CTR/CR ratio columns.

    Mutates ``df`` in place and also returns it. Ratio columns are NaN
    wherever the denominator is zero (see ``safe_divide``).
    """
    # Per-group sums; dict order matches the original column insertion order.
    group_sums = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_name, cols in group_sums.items():
        df[total_name] = df[cols].sum(axis=1)
    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
    df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
    df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
    df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
    df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
    df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 presence flags and the count of categories with orders.

    Mutates ``df`` in place and also returns it.
    """
    presence = [
        ("has_active_comm", ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS),
        ("has_passive_comm", PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS),
        ("has_any_order", ORDER_COLS),
    ]
    for flag_name, cols in presence:
        df[flag_name] = (df[cols].sum(axis=1) > 0).astype(int)
    # How many distinct categories have at least one order for this row.
    df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the raw CSV and run the full enrichment pipeline.

    Parses dates, normalizes categorical codes, then derives age groups,
    totals/ratios, and presence flags.
    """
    df = pd.read_csv(path)
    df["business_dt"] = pd.to_datetime(df["business_dt"])
    df["gender_cd"] = normalize_gender(df["gender_cd"])
    df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
    # Enrichment steps applied in the original order.
    for enrich in (add_age_group, add_totals, add_flags):
        df = enrich(df)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Per-column summary statistics, including the share of exact zeros.

    Returns one row per requested column with count/mean/median/std,
    min/max, quartiles, the 95th/99th percentiles, and ``share_zero``.
    """

    def _summarize(col: str) -> dict:
        s = df[col]
        return {
            "col": col,
            "count": s.count(),
            "mean": s.mean(),
            "median": s.median(),
            "std": s.std(),
            "min": s.min(),
            "q25": s.quantile(0.25),
            "q75": s.quantile(0.75),
            "max": s.max(),
            "share_zero": (s == 0).mean(),
            "p95": s.quantile(0.95),
            "p99": s.quantile(0.99),
        }

    return pd.DataFrame([_summarize(c) for c in cols])
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the raw table to one row per business date, with totals."""
    metric_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    daily = add_totals(df.groupby("business_dt")[metric_cols].sum().reset_index())
    daily["day_of_week"] = daily["business_dt"].dt.day_name()
    return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the per-day table into one enriched row per client ``id``.

    Numeric metric columns are summed; demographic columns are reduced with
    median (age) or mode (categoricals, with per-column fallbacks when the
    mode is empty). Also attaches contact-day counts, the max impressions
    seen on a single day, totals/flags, and contact density.
    """
    # Sum every metric column per client.
    agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
    # NOTE(review): `callable` is the builtin here, not typing.Callable — it is
    # never evaluated as a type at runtime, but typing.Callable would be cleaner.
    meta_spec: Dict[str, str | callable] = {
        "age": "median",
        # Mode can be empty (all-NaN group), hence the explicit fallbacks.
        "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
        "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
        "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
    }
    agg_spec.update(meta_spec)
    client = df.groupby("id").agg(agg_spec).reset_index()
    # Number of distinct days the client was contacted.
    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
    imp_day = df.copy()
    # Per-row (i.e. per-day) total impressions, used for the per-client max.
    imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
    # Order matters: add_totals must run before add_contact_density (imp_total),
    # and contact_days must be merged in before add_contact_density as well.
    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    client = add_contact_density(client)
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Derive average impressions per contact day.

    No-op when ``contact_days`` is absent; otherwise also requires
    ``imp_total`` (added by ``add_totals``). Mutates ``df`` and returns it.
    """
    if "contact_days" in df.columns:
        df["avg_impressions_per_contact_day"] = safe_divide(
            df["imp_total"], df["contact_days"]
        )
    return df
|
||||||
File diff suppressed because one or more lines are too long
154
alternative/saturation_effect/eda_utils.py
Normal file
154
alternative/saturation_effect/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Paths and column groups
|
||||||
|
DATA_PATH = Path("dataset/ds.csv")
|
||||||
|
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
|
|
||||||
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
NUMERIC_COLS = (
|
||||||
|
ACTIVE_IMP_COLS
|
||||||
|
+ PASSIVE_IMP_COLS
|
||||||
|
+ ACTIVE_CLICK_COLS
|
||||||
|
+ PASSIVE_CLICK_COLS
|
||||||
|
+ ORDER_COLS
|
||||||
|
+ ["age"]
|
||||||
|
)
|
||||||
|
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
    """Map raw gender codes onto the canonical set {"M", "F", "UNKNOWN"}."""
    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    upper = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
    # Anything outside the known spellings collapses to UNKNOWN.
    return upper.map(canonical).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
    """Normalize device-platform labels to canonical names (Android/iOS/iPadOS)."""
    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    # Build a whitespace/underscore-free lowercase key for matching.
    key = stripped.str.lower().str.replace(" ", "").str.replace("_", "")
    # Unrecognized platforms fall back to Title Case of the stripped label.
    return key.map(known).fillna(stripped.str.title())
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``age_group`` categorical derived from ``age``.

    Bins are left-closed (``right=False``), so e.g. age 25 lands in "25-34".
    Mutates ``df`` in place and also returns it.
    """
    df["age_group"] = pd.cut(
        df["age"],
        bins=[0, 25, 35, 45, 55, np.inf],
        labels=["<25", "25-34", "35-44", "45-54", "55+"],
        right=False,
    )
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row totals plus derived CTR/CR ratio columns.

    Mutates ``df`` in place and also returns it. Ratio columns are NaN
    wherever the denominator is zero (see ``safe_divide``).
    """
    # Per-group sums; dict order matches the original column insertion order.
    group_sums = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_name, cols in group_sums.items():
        df[total_name] = df[cols].sum(axis=1)
    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
    df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
    df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
    df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
    df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
    df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 presence flags and the count of categories with orders.

    Mutates ``df`` in place and also returns it.
    """
    presence = [
        ("has_active_comm", ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS),
        ("has_passive_comm", PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS),
        ("has_any_order", ORDER_COLS),
    ]
    for flag_name, cols in presence:
        df[flag_name] = (df[cols].sum(axis=1) > 0).astype(int)
    # How many distinct categories have at least one order for this row.
    df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the raw CSV and run the full enrichment pipeline.

    Parses dates, normalizes categorical codes, then derives age groups,
    totals/ratios, and presence flags.
    """
    df = pd.read_csv(path)
    df["business_dt"] = pd.to_datetime(df["business_dt"])
    df["gender_cd"] = normalize_gender(df["gender_cd"])
    df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
    # Enrichment steps applied in the original order.
    for enrich in (add_age_group, add_totals, add_flags):
        df = enrich(df)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Per-column summary statistics, including the share of exact zeros.

    Returns one row per requested column with count/mean/median/std,
    min/max, quartiles, the 95th/99th percentiles, and ``share_zero``.
    """

    def _summarize(col: str) -> dict:
        s = df[col]
        return {
            "col": col,
            "count": s.count(),
            "mean": s.mean(),
            "median": s.median(),
            "std": s.std(),
            "min": s.min(),
            "q25": s.quantile(0.25),
            "q75": s.quantile(0.75),
            "max": s.max(),
            "share_zero": (s == 0).mean(),
            "p95": s.quantile(0.95),
            "p99": s.quantile(0.99),
        }

    return pd.DataFrame([_summarize(c) for c in cols])
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the raw table to one row per business date, with totals."""
    metric_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    daily = add_totals(df.groupby("business_dt")[metric_cols].sum().reset_index())
    daily["day_of_week"] = daily["business_dt"].dt.day_name()
    return daily
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the per-day table into one enriched row per client ``id``.

    Numeric metric columns are summed; demographic columns are reduced with
    median (age) or mode (categoricals, with per-column fallbacks when the
    mode is empty). Also attaches contact-day counts, the max impressions
    seen on a single day, totals/flags, and contact density.
    """
    # Sum every metric column per client.
    agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
    # NOTE(review): `callable` is the builtin here, not typing.Callable — it is
    # never evaluated as a type at runtime, but typing.Callable would be cleaner.
    meta_spec: Dict[str, str | callable] = {
        "age": "median",
        # Mode can be empty (all-NaN group), hence the explicit fallbacks.
        "gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
        "age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
        "device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
    }
    agg_spec.update(meta_spec)
    client = df.groupby("id").agg(agg_spec).reset_index()
    # Number of distinct days the client was contacted.
    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
    imp_day = df.copy()
    # Per-row (i.e. per-day) total impressions, used for the per-client max.
    imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
    # Order matters: add_totals must run before add_contact_density (imp_total),
    # and contact_days must be merged in before add_contact_density as well.
    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    client = add_contact_density(client)
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Derive average impressions per contact day.

    No-op when ``contact_days`` is absent; otherwise also requires
    ``imp_total`` (added by ``add_totals``). Mutates ``df`` and returns it.
    """
    if "contact_days" in df.columns:
        df["avg_impressions_per_contact_day"] = safe_divide(
            df["imp_total"], df["contact_days"]
        )
    return df
|
||||||
188
spam_hypot/01_stat_analysis.ipynb
Normal file
188
spam_hypot/01_stat_analysis.ipynb
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4d7d3347",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Спам-гипотеза: плотность показов vs CTR/CR\n",
|
||||||
|
"\n",
|
||||||
|
"Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7acbd1c8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import sqlite3\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from scipy import stats\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||||||
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.impute import SimpleImputer\n",
|
||||||
|
"from sklearn.metrics import roc_auc_score\n",
|
||||||
|
"\n",
|
||||||
|
"sns.set_theme(style=\"whitegrid\")\n",
|
||||||
|
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
||||||
|
"\n",
|
||||||
|
"project_root = Path.cwd().resolve()\n",
|
||||||
|
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
||||||
|
" project_root = project_root.parent\n",
|
||||||
|
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
||||||
|
"import eda_utils as eda\n",
|
||||||
|
"\n",
|
||||||
|
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
||||||
|
"conn = sqlite3.connect(db_path)\n",
|
||||||
|
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
||||||
|
"conn.close()\n",
|
||||||
|
"\n",
|
||||||
|
"for cols, name in [\n",
|
||||||
|
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
||||||
|
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
||||||
|
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
||||||
|
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
||||||
|
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
||||||
|
"]:\n",
|
||||||
|
" df[name] = df[cols].sum(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
||||||
|
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
||||||
|
"\n",
|
||||||
|
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
||||||
|
"client = df.groupby(\"id\").agg(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"imp_total\": \"sum\",\n",
|
||||||
|
" \"click_total\": \"sum\",\n",
|
||||||
|
" \"orders_amt_total\": \"sum\",\n",
|
||||||
|
" \"age\": \"median\",\n",
|
||||||
|
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" }\n",
|
||||||
|
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
||||||
|
"\n",
|
||||||
|
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
||||||
|
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
||||||
|
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
||||||
|
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
||||||
|
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "94eb2d26",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Базовые статистики"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "287a09b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"summary = client[[\"imp_total\", \"click_total\", \"orders_amt_total\", \"contact_days\", \"avg_imp_per_day\", \"ctr_all\", \"cr_click2order\"]].describe().T\n",
|
||||||
|
"missing = client.isna().mean().sort_values(ascending=False)\n",
|
||||||
|
"summary, missing.head(10)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "10cd44b7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Корреляции и тесты\n",
|
||||||
|
"Спирмен между плотностью и CTR/CR, а также Mann–Whitney между Q1 и Q4 по плотности."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "88714a03",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"corr_ctr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"ctr_all\"])\n",
|
||||||
|
"corr_cr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"cr_click2order\"])\n",
|
||||||
|
"q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n",
|
||||||
|
"q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n",
|
||||||
|
"low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n",
|
||||||
|
"high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n",
|
||||||
|
"wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n",
|
||||||
|
"{ \"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu }\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
"execution_count": null,
"id": "20d492fa",
"metadata": {},
"outputs": [],
"source": [
|
||||||
|
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
|
||||||
|
"stats_bin = client.groupby(bins, observed=False).agg(\n",
|
||||||
|
" ctr_all=(\"ctr_all\", \"median\"),\n",
|
||||||
|
" cr_click2order=(\"cr_click2order\", \"median\"),\n",
|
||||||
|
" avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n",
|
||||||
|
").reset_index()\n",
|
||||||
|
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
||||||
|
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
||||||
|
"ax2 = ax1.twinx()\n",
|
||||||
|
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
|
||||||
|
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
|
||||||
|
"ax1.set_ylabel(\"CTR\")\n",
|
||||||
|
"ax2.set_ylabel(\"CR click→order\")\n",
|
||||||
|
"plt.xticks(rotation=35)\n",
|
||||||
|
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
|
||||||
|
"fig.tight_layout()\n",
|
||||||
|
"plt.show()\n",
|
||||||
|
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "943f0d4b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
|
||||||
|
"stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
|
||||||
|
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
||||||
|
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
||||||
|
"ax2 = ax1.twinx()\n",
|
||||||
|
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
|
||||||
|
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
|
||||||
|
"ax1.set_ylabel(\"CTR\")\n",
|
||||||
|
"ax2.set_ylabel(\"CR click→order\")\n",
|
||||||
|
"plt.xticks(rotation=35)\n",
|
||||||
|
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
|
||||||
|
"fig.tight_layout()\n",
|
||||||
|
"plt.show()\n",
|
||||||
|
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
161
spam_hypot/02_models.ipynb
Normal file
161
spam_hypot/02_models.ipynb
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7254b4c1",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Спам-гипотеза: сравнение моделей\n",
|
||||||
|
"\n",
|
||||||
|
"Target: `high_ctr` (верхний квартиль CTR)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c7f54168",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import sqlite3\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from scipy import stats\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
||||||
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||||||
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.impute import SimpleImputer\n",
|
||||||
|
"from sklearn.metrics import roc_auc_score\n",
|
||||||
|
"\n",
|
||||||
|
"sns.set_theme(style=\"whitegrid\")\n",
|
||||||
|
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
||||||
|
"\n",
|
||||||
|
"project_root = Path.cwd().resolve()\n",
|
||||||
|
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
||||||
|
" project_root = project_root.parent\n",
|
||||||
|
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
||||||
|
"import eda_utils as eda\n",
|
||||||
|
"\n",
|
||||||
|
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
||||||
|
"conn = sqlite3.connect(db_path)\n",
|
||||||
|
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
||||||
|
"conn.close()\n",
|
||||||
|
"\n",
|
||||||
|
"for cols, name in [\n",
|
||||||
|
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
||||||
|
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
||||||
|
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
||||||
|
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
||||||
|
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
||||||
|
"]:\n",
|
||||||
|
" df[name] = df[cols].sum(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
||||||
|
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
||||||
|
"\n",
|
||||||
|
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
||||||
|
"client = df.groupby(\"id\").agg(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"imp_total\": \"sum\",\n",
|
||||||
|
" \"click_total\": \"sum\",\n",
|
||||||
|
" \"orders_amt_total\": \"sum\",\n",
|
||||||
|
" \"age\": \"median\",\n",
|
||||||
|
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" }\n",
|
||||||
|
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
||||||
|
"\n",
|
||||||
|
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
||||||
|
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
||||||
|
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
||||||
|
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
||||||
|
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "21786c63",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Модели: Logistic Regression vs GradientBoosting"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "dc8dbc94",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
|
||||||
|
"X = X.copy()\n",
|
||||||
|
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
|
||||||
|
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
|
||||||
|
"y = client[\"high_ctr\"]\n",
|
||||||
|
"\n",
|
||||||
|
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
|
||||||
|
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
|
||||||
|
"pre = ColumnTransformer([\n",
|
||||||
|
" (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
|
||||||
|
" (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"log_reg = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n",
|
||||||
|
"gb = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
||||||
|
"res = {}\n",
|
||||||
|
"for name, model in [(\"log_reg\", log_reg), (\"gb\", gb)]:\n",
|
||||||
|
" model.fit(X_train, y_train)\n",
|
||||||
|
" proba = model.predict_proba(X_test)[:, 1]\n",
|
||||||
|
" res[name] = roc_auc_score(y_test, proba)\n",
|
||||||
|
"res\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "203acf70",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Важности признаков (GradientBoosting)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3eac9e17",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"gb_model = gb\n",
|
||||||
|
"feat_names = gb_model.named_steps[\"pre\"].get_feature_names_out()\n",
|
||||||
|
"importances = gb_model.named_steps[\"clf\"].feature_importances_\n",
|
||||||
|
"imp_df = pd.DataFrame({\"feature\": feat_names, \"importance\": importances}).sort_values(\"importance\", ascending=False)\n",
|
||||||
|
"plt.figure(figsize=(8, 5))\n",
|
||||||
|
"sns.barplot(data=imp_df.head(15), x=\"importance\", y=\"feature\", palette=\"viridis\")\n",
|
||||||
|
"plt.title(\"Top-15 feature importances (GB)\")\n",
|
||||||
|
"plt.tight_layout()\n",
|
||||||
|
"plt.show()\n",
|
||||||
|
"imp_df.head(15)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
206
spam_hypot/03_best_model.ipynb
Normal file
206
spam_hypot/03_best_model.ipynb
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d88bf2d8",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Спам-гипотеза: лучшая модель и визуализации\n",
|
||||||
|
"\n",
|
||||||
|
"Используем GradientBoostingClassifier (лучше логрега по AUC) для подтверждения гипотезы."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "87f3f728",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import sqlite3\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from scipy import stats\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||||||
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.impute import SimpleImputer\n",
|
||||||
|
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import roc_auc_score\n",
|
||||||
|
"\n",
|
||||||
|
"sns.set_theme(style=\"whitegrid\")\n",
|
||||||
|
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
||||||
|
"\n",
|
||||||
|
"project_root = Path.cwd().resolve()\n",
|
||||||
|
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
||||||
|
" project_root = project_root.parent\n",
|
||||||
|
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
||||||
|
"import eda_utils as eda\n",
|
||||||
|
"\n",
|
||||||
|
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
||||||
|
"conn = sqlite3.connect(db_path)\n",
|
||||||
|
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
||||||
|
"conn.close()\n",
|
||||||
|
"\n",
|
||||||
|
"for cols, name in [\n",
|
||||||
|
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
||||||
|
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
||||||
|
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
||||||
|
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
||||||
|
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
||||||
|
"]:\n",
|
||||||
|
" df[name] = df[cols].sum(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
||||||
|
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
||||||
|
"\n",
|
||||||
|
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
||||||
|
"client = df.groupby(\"id\").agg(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"imp_total\": \"sum\",\n",
|
||||||
|
" \"click_total\": \"sum\",\n",
|
||||||
|
" \"orders_amt_total\": \"sum\",\n",
|
||||||
|
" \"age\": \"median\",\n",
|
||||||
|
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
||||||
|
" }\n",
|
||||||
|
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
||||||
|
"\n",
|
||||||
|
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
||||||
|
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
||||||
|
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
||||||
|
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
||||||
|
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "17da010c",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Обучение лучшей модели"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "81433d7e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
|
||||||
|
"X = X.copy()\n",
|
||||||
|
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
|
||||||
|
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
|
||||||
|
"y = client[\"high_ctr\"]\n",
|
||||||
|
"\n",
|
||||||
|
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
|
||||||
|
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
|
||||||
|
"pre = ColumnTransformer([\n",
|
||||||
|
" (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
|
||||||
|
" (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
||||||
|
"best.fit(X_train, y_train)\n",
|
||||||
|
"proba = best.predict_proba(X_test)[:, 1]\n",
|
||||||
|
"auc = roc_auc_score(y_test, proba)\n",
|
||||||
|
"auc\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "63f4db9b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Прогноз vs плотность показов"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f48584b5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n",
|
||||||
|
"base = client.median(numeric_only=True)\n",
|
||||||
|
"base_gender = client[\"gender_cd\"].mode().iat[0]\n",
|
||||||
|
"base_device = client[\"device_platform_cd\"].mode().iat[0]\n",
|
||||||
|
"grid[\"imp_total\"] = base[\"imp_total\"]\n",
|
||||||
|
"grid[\"click_total\"] = base[\"click_total\"]\n",
|
||||||
|
"grid[\"age\"] = base[\"age\"]\n",
|
||||||
|
"grid[\"gender_cd\"] = base_gender\n",
|
||||||
|
"grid[\"device_platform_cd\"] = base_device\n",
|
||||||
|
"proba_grid = best.predict_proba(grid)[:, 1]\n",
|
||||||
|
"plt.figure(figsize=(10, 4))\n",
|
||||||
|
"plt.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n",
|
||||||
|
"plt.xlabel(\"avg_imp_per_day\")\n",
|
||||||
|
"plt.ylabel(\"P(high CTR)\")\n",
|
||||||
|
"plt.title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n",
|
||||||
|
"plt.tight_layout()\n",
|
||||||
|
"plt.show()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "32f73b44",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## График CTR и CR по тонким бинам (две оси)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bb4d0190",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\")\n",
|
||||||
|
"stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
|
||||||
|
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
||||||
|
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
||||||
|
"ax2 = ax1.twinx()\n",
|
||||||
|
"ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", color=\"#4c72b0\", label=\"CTR\")\n",
|
||||||
|
"ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n",
|
||||||
|
"ax1.set_ylabel(\"CTR\")\n",
|
||||||
|
"ax2.set_ylabel(\"CR click→order\")\n",
|
||||||
|
"ax1.set_xlabel(\"avg_imp_per_day bins\")\n",
|
||||||
|
"plt.xticks(rotation=35)\n",
|
||||||
|
"ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n",
|
||||||
|
"fig.tight_layout()\n",
|
||||||
|
"plt.show()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ebb2ca5e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Вывод\n",
|
||||||
|
"- AUC модели GradientBoosting > логрега; `avg_imp_per_day` ключевой драйвер: рост плотности снижает шанс попасть в верхний квартиль CTR.\n",
|
||||||
|
"- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n",
|
||||||
|
"- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
BIN
spam_hypot/best_bins.png
Normal file
BIN
spam_hypot/best_bins.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 109 KiB |
114
spam_hypot/best_model_and_plots.py
Normal file
114
spam_hypot/best_model_and_plots.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""Train the best spam-hypothesis model (GradientBoosting) and save its plots.

Loads the communications table from SQLite, aggregates it to one row per
client, fits a GradientBoostingClassifier predicting membership in the top
CTR quartile, and writes two figures next to this script:

* ``best_model_prob.png`` — predicted P(high CTR) as a function of
  impression density (``avg_imp_per_day``), other features held at medians.
* ``best_bins.png`` — median CTR and click→order CR across 15 quantile
  bins of ``avg_imp_per_day`` on twin y-axes.
"""

import sqlite3
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Make the shared EDA helpers importable regardless of the working directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda  # noqa: E402

# Load the raw communications log.
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()

# Row-level totals across the product categories defined in eda_utils.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Aggregate to one row per client; contact_days = distinct contact dates.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
# Derived rates; safe_divide yields NaN where the denominator is zero.
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# Target: clients in the top CTR quartile.
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)

X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
X = X.copy()
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
y = client["high_ctr"]

# Preprocessing: median-impute + scale numerics, one-hot the categoricals.
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print("Best model AUC:", auc)

# Probability vs avg_imp_per_day: sweep density over its observed range while
# holding every other feature at the median (numeric) or mode (categorical).
grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
base = client.median(numeric_only=True)
base_gender = client["gender_cd"].mode().iat[0]
base_device = client["device_platform_cd"].mode().iat[0]
grid["imp_total"] = base["imp_total"]
grid["click_total"] = base["click_total"]
grid["age"] = base["age"]
grid["gender_cd"] = base_gender
grid["device_platform_cd"] = base_device
proba_grid = model.predict_proba(grid)[:, 1]
plt.figure(figsize=(10, 4))
plt.plot(grid["avg_imp_per_day"], proba_grid, marker="o")
plt.xlabel("avg_imp_per_day")
plt.ylabel("P(high CTR)")
plt.title("Предсказанная вероятность высокого CTR vs плотность показов")
plt.tight_layout()
plt.savefig(project_root / "spam_hypot" / "best_model_prob.png", dpi=150)
print("Saved best_model_prob.png")

# Dual-axis CTR/CR vs fine bins: median rates within 15 quantile bins of
# avg_imp_per_day, labelled by the bin's median density.
bins = pd.qcut(client["avg_imp_per_day"], 15, duplicates="drop")
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index()
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
plt.xticks(rotation=35)
ax1.set_title("CTR и CR по 15 бинам avg_imp_per_day")
fig.tight_layout()
plt.savefig(project_root / "spam_hypot" / "best_bins.png", dpi=150)
print("Saved best_bins.png")
|
||||||
BIN
spam_hypot/best_model_prob.png
Normal file
BIN
spam_hypot/best_model_prob.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 45 KiB |
80
spam_hypot/model_compare.py
Normal file
80
spam_hypot/model_compare.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
"""Compare LogisticRegression vs GradientBoosting on the "high CTR" target.

Builds client-level features from the communications table, trains both
models inside identical preprocessing pipelines on the same train/test
split, prints the test ROC-AUC of each, and lists the top feature
importances of the gradient-boosting model.
"""

import sqlite3
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Make the shared EDA helpers importable regardless of the working directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda  # noqa: E402

# Load the raw communications log.
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()

# Row-level totals across the product categories defined in eda_utils.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Aggregate to one row per client; contact_days = distinct contact dates.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
# Derived rates; safe_divide yields NaN where the denominator is zero.
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# Target: clients in the top CTR quartile.
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)

X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
X = X.copy()
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
y = client["high_ctr"]

# Preprocessing: median-impute + scale numerics, one-hot the categoricals.
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])

# Same split for both models so the AUCs are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)
print("AUC results:", results)

# Feature importances of the (fitted) gradient-boosting pipeline.
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))
|
||||||
87
spam_hypot/stat_analysis.py
Normal file
87
spam_hypot/stat_analysis.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
"""Statistical check of the spam hypothesis.

Aggregates communications to client level and tests whether a higher
impression density (``avg_imp_per_day``) is associated with lower CTR and
click→order conversion: descriptive summary, Spearman correlations, a
one-sided Mann–Whitney test (low-density vs high-density quartiles), and a
dual-axis plot of median CTR/CR across density deciles saved as
``stat_bins.png``.
"""

import sqlite3
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Make the shared EDA helpers importable regardless of the working directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda  # noqa: E402

# Load the raw communications log.
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()

# Row-level totals across the product categories defined in eda_utils.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Aggregate to one row per client; contact_days = distinct contact dates.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)

# Derived rates; safe_divide yields NaN where the denominator is zero.
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
client["has_order"] = (client["orders_amt_total"] > 0).astype(int)

# Summary
summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
print("Summary\n", summary)
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.head(10))

# Correlations and Mann-Whitney: alternative="greater" tests whether CTR in
# the low-density quartile stochastically dominates the high-density quartile.
corr_ctr = stats.spearmanr(client["avg_imp_per_day"], client["ctr_all"])
corr_cr = stats.spearmanr(client["avg_imp_per_day"], client["cr_click2order"])
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})

# Bin stats and dual-axis plot: median CTR/CR per density decile, labelled
# by each bin's median avg_imp_per_day.
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index()
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
plt.xticks(rotation=35)
ax1.set_title("CTR и CR по децилям avg_imp_per_day")
fig.tight_layout()
plt.savefig(project_root / "spam_hypot" / "stat_bins.png", dpi=150)
print("Saved plot stat_bins.png")
|
||||||
BIN
spam_hypot/stat_bins.png
Normal file
BIN
spam_hypot/stat_bins.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 97 KiB |
Reference in New Issue
Block a user