fully working spam hypot
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
DATA_PATH = Path("dataset/ds.csv")  # default location of the raw dataset CSV
# Business categories every metric column family is broken down by.
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]

# Per-category metric column names, e.g. "active_imp_ent", "orders_amt_avia".
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]

# Every numeric feature column: all metric families plus client age.
NUMERIC_COLS = (
    ACTIVE_IMP_COLS
    + PASSIVE_IMP_COLS
    + ACTIVE_CLICK_COLS
    + PASSIVE_CLICK_COLS
    + ORDER_COLS
    + ["age"]
)
# Categorical feature columns.
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
    """Map raw gender codes onto the closed set {"M", "F", "UNKNOWN"}."""
    # Canonical spellings accepted from the raw feed; anything else is UNKNOWN.
    canon = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    upper = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
    return upper.map(canon).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
    """Normalize device platform labels to canonical names (Android/iOS/iPadOS).

    Values that don't match a known platform fall back to the title-cased
    original string.
    """
    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    # Compare with spaces/underscores removed so "i pad os" and "iPad_OS" match.
    key = stripped.str.lower().str.replace(" ", "").str.replace("_", "")
    return key.map(known).fillna(stripped.str.title())
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``age_group`` categorical column bucketing ``age``.

    Buckets are left-closed: [0, 25), [25, 35), [35, 45), [45, 55), [55, inf).
    Mutates and returns *df*.
    """
    edges = [0, 25, 35, 45, 55, np.inf]
    names = ["<25", "25-34", "35-44", "45-54", "55+"]
    df["age_group"] = pd.cut(df["age"], bins=edges, labels=names, right=False)
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row metric totals and derived CTR/conversion-rate columns.

    Mutates and returns *df*.  Rates are NaN-safe via ``safe_divide`` (zero
    denominators yield NaN rather than inf).
    """
    # Column-family sums; dict order matches the original column insertion order.
    group_totals = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_col, parts in group_totals.items():
        df[total_col] = df[parts].sum(axis=1)
    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
    # Click-through and conversion rates.
    df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
    df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
    df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
    df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
    df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add binary activity flags and an order-category diversity count.

    Mutates and returns *df*.
    """
    comm_active = df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1)
    comm_passive = df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1)
    df["has_active_comm"] = comm_active.gt(0).astype(int)
    df["has_passive_comm"] = comm_passive.gt(0).astype(int)
    df["has_any_order"] = df[ORDER_COLS].sum(axis=1).gt(0).astype(int)
    # Number of distinct categories with at least one order.
    df["order_categories_count"] = df[ORDER_COLS].gt(0).sum(axis=1)
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Load the raw CSV and apply the full cleaning/feature pipeline.

    Parses dates, normalizes gender and device codes, then derives the age
    group, totals/rates, and activity flags.
    """
    df = pd.read_csv(path)
    df["business_dt"] = pd.to_datetime(df["business_dt"])
    df["gender_cd"] = normalize_gender(df["gender_cd"])
    df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
    # Feature steps all mutate-and-return, so they chain cleanly via pipe.
    return df.pipe(add_age_group).pipe(add_totals).pipe(add_flags)
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Summary statistics per column, including the share of exact zeros.

    Returns one row per column in *cols* with count/central-tendency/spread
    stats, tail quantiles, and ``share_zero`` (fraction of values equal to 0).
    """

    def _row(name: str) -> dict:
        # One summary record for a single column.
        s = df[name]
        return {
            "col": name,
            "count": s.count(),
            "mean": s.mean(),
            "median": s.median(),
            "std": s.std(),
            "min": s.min(),
            "q25": s.quantile(0.25),
            "q75": s.quantile(0.75),
            "max": s.max(),
            "share_zero": s.eq(0).mean(),
            "p95": s.quantile(0.95),
            "p99": s.quantile(0.99),
        }

    return pd.DataFrame([_row(c) for c in cols])
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw rows into one row per business date, with derived totals."""
    raw_cols = (
        ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
    )
    daily = add_totals(df.groupby("business_dt")[raw_cols].sum().reset_index())
    daily["day_of_week"] = daily["business_dt"].dt.day_name()
    return daily
|
|
||||||
|
|
||||||
|
|
||||||
def _mode_or_default(series: pd.Series, default):
    """Return the most frequent value of *series*, or *default* if the mode is empty.

    Computes ``mode()`` once per call; the original evaluated it twice per
    group per column.
    """
    modes = series.mode()
    return modes.iat[0] if not modes.empty else default


def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw rows into one row per client ``id`` with profile features.

    Sums all impression/click/order columns, takes median age and the modal
    gender / age group / device, then attaches the number of distinct contact
    days, the peak single-day impression count, and derived
    totals/flags/contact density.
    """
    sum_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
    # Mixed string aggregations and callables; typed as object because pandas
    # accepts both (the original Dict[str, str] annotation was inaccurate, and
    # "str | callable" misused the builtin as a type).
    agg_spec: Dict[str, object] = {col: "sum" for col in sum_cols}
    agg_spec.update(
        {
            "age": "median",
            "gender_cd": lambda s: _mode_or_default(s, "UNKNOWN"),
            "age_group": lambda s: _mode_or_default(s, np.nan),
            "device_platform_cd": lambda s: _mode_or_default(s, "Other"),
        }
    )
    client = df.groupby("id").agg(agg_spec).reset_index()

    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
    # Peak daily impression load (active + passive) per client.  Computed on a
    # derived Series instead of copying the whole frame as before.
    imp_day_total = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = imp_day_total.groupby(df["id"]).max().rename("max_impressions_per_day")

    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    client = add_contact_density(client)
    return client
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add average impressions per contact day when ``contact_days`` is present.

    No-op (returns *df* unchanged) on frames without a ``contact_days`` column.
    """
    if "contact_days" not in df.columns:
        return df
    df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
    return df
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
|
|
||||||
DATA_PATH = Path("dataset/ds.csv")
|
|
||||||
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
||||||
|
|
||||||
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
||||||
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
|
||||||
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
||||||
|
|
||||||
NUMERIC_COLS = (
|
|
||||||
ACTIVE_IMP_COLS
|
|
||||||
+ PASSIVE_IMP_COLS
|
|
||||||
+ ACTIVE_CLICK_COLS
|
|
||||||
+ PASSIVE_CLICK_COLS
|
|
||||||
+ ORDER_COLS
|
|
||||||
+ ["age"]
|
|
||||||
)
|
|
||||||
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
|
||||||
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
|
||||||
return cleaned.map(mapping).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("unknown").astype(str).str.strip()
|
|
||||||
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
|
||||||
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
|
||||||
mapped = lowered.map(mapping)
|
|
||||||
fallback = cleaned.str.title()
|
|
||||||
return mapped.fillna(fallback)
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
bins = [0, 25, 35, 45, 55, np.inf]
|
|
||||||
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
|
||||||
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
|
||||||
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
|
||||||
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
|
||||||
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
|
||||||
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
|
||||||
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
|
||||||
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
|
||||||
df = pd.read_csv(path)
|
|
||||||
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
|
||||||
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
|
||||||
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
|
||||||
df = add_age_group(df)
|
|
||||||
df = add_totals(df)
|
|
||||||
df = add_flags(df)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
|
||||||
stats = []
|
|
||||||
for col in cols:
|
|
||||||
series = df[col]
|
|
||||||
stats.append(
|
|
||||||
{
|
|
||||||
"col": col,
|
|
||||||
"count": series.count(),
|
|
||||||
"mean": series.mean(),
|
|
||||||
"median": series.median(),
|
|
||||||
"std": series.std(),
|
|
||||||
"min": series.min(),
|
|
||||||
"q25": series.quantile(0.25),
|
|
||||||
"q75": series.quantile(0.75),
|
|
||||||
"max": series.max(),
|
|
||||||
"share_zero": (series == 0).mean(),
|
|
||||||
"p95": series.quantile(0.95),
|
|
||||||
"p99": series.quantile(0.99),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return pd.DataFrame(stats)
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
|
||||||
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
|
||||||
daily = add_totals(daily)
|
|
||||||
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
|
||||||
return daily
|
|
||||||
|
|
||||||
|
|
||||||
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
|
||||||
meta_spec: Dict[str, str | callable] = {
|
|
||||||
"age": "median",
|
|
||||||
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
|
||||||
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
|
||||||
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
|
||||||
}
|
|
||||||
agg_spec.update(meta_spec)
|
|
||||||
client = df.groupby("id").agg(agg_spec).reset_index()
|
|
||||||
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
|
||||||
imp_day = df.copy()
|
|
||||||
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
|
||||||
client = add_totals(client)
|
|
||||||
client = add_flags(client)
|
|
||||||
client = client.merge(contact_days, on="id", how="left")
|
|
||||||
client = client.merge(max_imp_day, on="id", how="left")
|
|
||||||
client = add_contact_density(client)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
# contact_days must already be present
|
|
||||||
if "contact_days" in df.columns:
|
|
||||||
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
|
||||||
return df
|
|
||||||
return df
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
|
|
||||||
DATA_PATH = Path("dataset/ds.csv")
|
|
||||||
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
||||||
|
|
||||||
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
||||||
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
|
||||||
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
||||||
|
|
||||||
NUMERIC_COLS = (
|
|
||||||
ACTIVE_IMP_COLS
|
|
||||||
+ PASSIVE_IMP_COLS
|
|
||||||
+ ACTIVE_CLICK_COLS
|
|
||||||
+ PASSIVE_CLICK_COLS
|
|
||||||
+ ORDER_COLS
|
|
||||||
+ ["age"]
|
|
||||||
)
|
|
||||||
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
|
||||||
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
|
||||||
return cleaned.map(mapping).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("unknown").astype(str).str.strip()
|
|
||||||
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
|
||||||
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
|
||||||
mapped = lowered.map(mapping)
|
|
||||||
fallback = cleaned.str.title()
|
|
||||||
return mapped.fillna(fallback)
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
bins = [0, 25, 35, 45, 55, np.inf]
|
|
||||||
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
|
||||||
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
|
||||||
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
|
||||||
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
|
||||||
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
|
||||||
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
|
||||||
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
|
||||||
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
|
||||||
df = pd.read_csv(path)
|
|
||||||
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
|
||||||
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
|
||||||
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
|
||||||
df = add_age_group(df)
|
|
||||||
df = add_totals(df)
|
|
||||||
df = add_flags(df)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
|
||||||
stats = []
|
|
||||||
for col in cols:
|
|
||||||
series = df[col]
|
|
||||||
stats.append(
|
|
||||||
{
|
|
||||||
"col": col,
|
|
||||||
"count": series.count(),
|
|
||||||
"mean": series.mean(),
|
|
||||||
"median": series.median(),
|
|
||||||
"std": series.std(),
|
|
||||||
"min": series.min(),
|
|
||||||
"q25": series.quantile(0.25),
|
|
||||||
"q75": series.quantile(0.75),
|
|
||||||
"max": series.max(),
|
|
||||||
"share_zero": (series == 0).mean(),
|
|
||||||
"p95": series.quantile(0.95),
|
|
||||||
"p99": series.quantile(0.99),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return pd.DataFrame(stats)
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
|
||||||
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
|
||||||
daily = add_totals(daily)
|
|
||||||
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
|
||||||
return daily
|
|
||||||
|
|
||||||
|
|
||||||
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
|
||||||
meta_spec: Dict[str, str | callable] = {
|
|
||||||
"age": "median",
|
|
||||||
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
|
||||||
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
|
||||||
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
|
||||||
}
|
|
||||||
agg_spec.update(meta_spec)
|
|
||||||
client = df.groupby("id").agg(agg_spec).reset_index()
|
|
||||||
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
|
||||||
imp_day = df.copy()
|
|
||||||
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
|
||||||
client = add_totals(client)
|
|
||||||
client = add_flags(client)
|
|
||||||
client = client.merge(contact_days, on="id", how="left")
|
|
||||||
client = client.merge(max_imp_day, on="id", how="left")
|
|
||||||
client = add_contact_density(client)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
# contact_days must already be present
|
|
||||||
if "contact_days" in df.columns:
|
|
||||||
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
|
||||||
return df
|
|
||||||
return df
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
|
|
||||||
DATA_PATH = Path("dataset/ds.csv")
|
|
||||||
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
||||||
|
|
||||||
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
||||||
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
|
||||||
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
||||||
|
|
||||||
NUMERIC_COLS = (
|
|
||||||
ACTIVE_IMP_COLS
|
|
||||||
+ PASSIVE_IMP_COLS
|
|
||||||
+ ACTIVE_CLICK_COLS
|
|
||||||
+ PASSIVE_CLICK_COLS
|
|
||||||
+ ORDER_COLS
|
|
||||||
+ ["age"]
|
|
||||||
)
|
|
||||||
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
|
||||||
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
|
||||||
return cleaned.map(mapping).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("unknown").astype(str).str.strip()
|
|
||||||
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
|
||||||
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
|
||||||
mapped = lowered.map(mapping)
|
|
||||||
fallback = cleaned.str.title()
|
|
||||||
return mapped.fillna(fallback)
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
bins = [0, 25, 35, 45, 55, np.inf]
|
|
||||||
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
|
||||||
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
|
||||||
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
|
||||||
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
|
||||||
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
|
||||||
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
|
||||||
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
|
||||||
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
|
||||||
df = pd.read_csv(path)
|
|
||||||
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
|
||||||
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
|
||||||
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
|
||||||
df = add_age_group(df)
|
|
||||||
df = add_totals(df)
|
|
||||||
df = add_flags(df)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
|
||||||
stats = []
|
|
||||||
for col in cols:
|
|
||||||
series = df[col]
|
|
||||||
stats.append(
|
|
||||||
{
|
|
||||||
"col": col,
|
|
||||||
"count": series.count(),
|
|
||||||
"mean": series.mean(),
|
|
||||||
"median": series.median(),
|
|
||||||
"std": series.std(),
|
|
||||||
"min": series.min(),
|
|
||||||
"q25": series.quantile(0.25),
|
|
||||||
"q75": series.quantile(0.75),
|
|
||||||
"max": series.max(),
|
|
||||||
"share_zero": (series == 0).mean(),
|
|
||||||
"p95": series.quantile(0.95),
|
|
||||||
"p99": series.quantile(0.99),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return pd.DataFrame(stats)
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
|
||||||
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
|
||||||
daily = add_totals(daily)
|
|
||||||
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
|
||||||
return daily
|
|
||||||
|
|
||||||
|
|
||||||
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
|
||||||
meta_spec: Dict[str, str | callable] = {
|
|
||||||
"age": "median",
|
|
||||||
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
|
||||||
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
|
||||||
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
|
||||||
}
|
|
||||||
agg_spec.update(meta_spec)
|
|
||||||
client = df.groupby("id").agg(agg_spec).reset_index()
|
|
||||||
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
|
||||||
imp_day = df.copy()
|
|
||||||
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
|
||||||
client = add_totals(client)
|
|
||||||
client = add_flags(client)
|
|
||||||
client = client.merge(contact_days, on="id", how="left")
|
|
||||||
client = client.merge(max_imp_day, on="id", how="left")
|
|
||||||
client = add_contact_density(client)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
# contact_days must already be present
|
|
||||||
if "contact_days" in df.columns:
|
|
||||||
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
|
||||||
return df
|
|
||||||
return df
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
|
|
||||||
DATA_PATH = Path("dataset/ds.csv")
|
|
||||||
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
||||||
|
|
||||||
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
||||||
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
|
||||||
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
||||||
|
|
||||||
NUMERIC_COLS = (
|
|
||||||
ACTIVE_IMP_COLS
|
|
||||||
+ PASSIVE_IMP_COLS
|
|
||||||
+ ACTIVE_CLICK_COLS
|
|
||||||
+ PASSIVE_CLICK_COLS
|
|
||||||
+ ORDER_COLS
|
|
||||||
+ ["age"]
|
|
||||||
)
|
|
||||||
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
|
||||||
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
|
||||||
return cleaned.map(mapping).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("unknown").astype(str).str.strip()
|
|
||||||
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
|
||||||
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
|
||||||
mapped = lowered.map(mapping)
|
|
||||||
fallback = cleaned.str.title()
|
|
||||||
return mapped.fillna(fallback)
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``age_group`` categorical column derived from ``age``.

    Buckets are left-closed / right-open (``right=False``), so e.g. an age of
    exactly 25 falls into "25-34".
    """
    df["age_group"] = pd.cut(
        df["age"],
        bins=[0, 25, 35, 45, 55, np.inf],
        labels=["<25", "25-34", "35-44", "45-54", "55+"],
        right=False,
    )
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-group total columns plus CTR / conversion-rate columns in place."""
    group_totals = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_name, cols in group_totals.items():
        df[total_name] = df[cols].sum(axis=1)
    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
    # Rate metrics; safe_divide turns zero denominators into NaN.
    df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
    df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
    df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
    df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
    df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add binary communication/order flags and an order-category counter."""
    active_cols = ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS
    passive_cols = PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS
    df["has_active_comm"] = (df[active_cols].sum(axis=1) > 0).astype(int)
    df["has_passive_comm"] = (df[passive_cols].sum(axis=1) > 0).astype(int)
    df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
    # Number of distinct categories in which any order was placed.
    df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Load the raw CSV and apply the full cleaning + feature pipeline."""
    df = pd.read_csv(path)
    df["business_dt"] = pd.to_datetime(df["business_dt"])
    # Canonicalize categorical codes before deriving features.
    df["gender_cd"] = normalize_gender(df["gender_cd"])
    df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
    return add_flags(add_totals(add_age_group(df)))
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Per-column summary statistics, including the share of exact zeros."""

    def _summarize(col: str) -> Dict[str, object]:
        # One stats row for a single column.
        s = df[col]
        return {
            "col": col,
            "count": s.count(),
            "mean": s.mean(),
            "median": s.median(),
            "std": s.std(),
            "min": s.min(),
            "q25": s.quantile(0.25),
            "q75": s.quantile(0.75),
            "max": s.max(),
            "share_zero": (s == 0).mean(),
            "p95": s.quantile(0.95),
            "p99": s.quantile(0.99),
        }

    return pd.DataFrame([_summarize(c) for c in cols])
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw rows to one row per business date, with derived metrics."""
    metric_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    daily = add_totals(df.groupby("business_dt")[metric_cols].sum().reset_index())
    daily["day_of_week"] = daily["business_dt"].dt.day_name()
    return daily
|
|
||||||
|
|
||||||
|
|
||||||
def _mode_or(default):
    """Return an aggregator yielding the most frequent value, or ``default``."""

    def agg(s: pd.Series):
        modes = s.mode()  # compute once (the original lambdas computed it twice)
        return modes.iat[0] if not modes.empty else default

    return agg


def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw rows to one row per client ``id`` with features and flags.

    Sums all per-category metrics, takes median age and modal categorical
    values, then derives totals, rates, flags, per-day impression extremes,
    and contact density.
    """
    metric_cols = (
        ACTIVE_IMP_COLS
        + PASSIVE_IMP_COLS
        + ACTIVE_CLICK_COLS
        + PASSIVE_CLICK_COLS
        + ORDER_COLS
    )
    # Mixed str / callable aggregators; ``callable`` (the builtin) is not a
    # valid typing construct, so annotate as ``object``.
    agg_spec: Dict[str, object] = {col: "sum" for col in metric_cols}
    agg_spec.update(
        {
            "age": "median",
            "gender_cd": _mode_or("UNKNOWN"),
            "age_group": _mode_or(np.nan),
            "device_platform_cd": _mode_or("Other"),
        }
    )
    client = df.groupby("id").agg(agg_spec).reset_index()

    # Days with at least one contact, and the busiest single day.
    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
    per_day = df.copy()
    per_day["imp_day_total"] = per_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = (
        per_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
    )

    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    return add_contact_density(client)
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add avg impressions per contact day; no-op when ``contact_days`` is absent."""
    if "contact_days" not in df.columns:
        # Nothing to do without the prerequisite column.
        return df
    df["avg_impressions_per_contact_day"] = safe_divide(
        df["imp_total"], df["contact_days"]
    )
    return df
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,154 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Iterable, List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Paths and column groups
|
|
||||||
DATA_PATH = Path("dataset/ds.csv")
|
|
||||||
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
||||||
|
|
||||||
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
||||||
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
|
|
||||||
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
|
|
||||||
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
||||||
|
|
||||||
NUMERIC_COLS = (
|
|
||||||
ACTIVE_IMP_COLS
|
|
||||||
+ PASSIVE_IMP_COLS
|
|
||||||
+ ACTIVE_CLICK_COLS
|
|
||||||
+ PASSIVE_CLICK_COLS
|
|
||||||
+ ORDER_COLS
|
|
||||||
+ ["age"]
|
|
||||||
)
|
|
||||||
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
|
||||||
|
|
||||||
|
|
||||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
|
||||||
"""Divide with protection against zero (works for Series and scalars)."""
|
|
||||||
if isinstance(denominator, pd.Series):
|
|
||||||
denom = denominator.replace(0, np.nan)
|
|
||||||
else:
|
|
||||||
denom = np.nan if float(denominator) == 0 else denominator
|
|
||||||
return numerator / denom
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_gender(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
|
|
||||||
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
|
|
||||||
return cleaned.map(mapping).fillna("UNKNOWN")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_device(series: pd.Series) -> pd.Series:
|
|
||||||
cleaned = series.fillna("unknown").astype(str).str.strip()
|
|
||||||
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
|
|
||||||
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
|
|
||||||
mapped = lowered.map(mapping)
|
|
||||||
fallback = cleaned.str.title()
|
|
||||||
return mapped.fillna(fallback)
|
|
||||||
|
|
||||||
|
|
||||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
bins = [0, 25, 35, 45, 55, np.inf]
|
|
||||||
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
|
|
||||||
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
|
|
||||||
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
|
||||||
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
|
||||||
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
|
|
||||||
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
|
|
||||||
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
|
|
||||||
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
|
|
||||||
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
|
|
||||||
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
|
|
||||||
df = pd.read_csv(path)
|
|
||||||
df["business_dt"] = pd.to_datetime(df["business_dt"])
|
|
||||||
df["gender_cd"] = normalize_gender(df["gender_cd"])
|
|
||||||
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
|
|
||||||
df = add_age_group(df)
|
|
||||||
df = add_totals(df)
|
|
||||||
df = add_flags(df)
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
|
|
||||||
stats = []
|
|
||||||
for col in cols:
|
|
||||||
series = df[col]
|
|
||||||
stats.append(
|
|
||||||
{
|
|
||||||
"col": col,
|
|
||||||
"count": series.count(),
|
|
||||||
"mean": series.mean(),
|
|
||||||
"median": series.median(),
|
|
||||||
"std": series.std(),
|
|
||||||
"min": series.min(),
|
|
||||||
"q25": series.quantile(0.25),
|
|
||||||
"q75": series.quantile(0.75),
|
|
||||||
"max": series.max(),
|
|
||||||
"share_zero": (series == 0).mean(),
|
|
||||||
"p95": series.quantile(0.95),
|
|
||||||
"p99": series.quantile(0.99),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return pd.DataFrame(stats)
|
|
||||||
|
|
||||||
|
|
||||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
|
|
||||||
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
|
|
||||||
daily = add_totals(daily)
|
|
||||||
daily["day_of_week"] = daily["business_dt"].dt.day_name()
|
|
||||||
return daily
|
|
||||||
|
|
||||||
|
|
||||||
def build_client(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
|
|
||||||
meta_spec: Dict[str, str | callable] = {
|
|
||||||
"age": "median",
|
|
||||||
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
|
|
||||||
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
|
|
||||||
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
|
|
||||||
}
|
|
||||||
agg_spec.update(meta_spec)
|
|
||||||
client = df.groupby("id").agg(agg_spec).reset_index()
|
|
||||||
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
|
|
||||||
imp_day = df.copy()
|
|
||||||
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
|
||||||
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
|
|
||||||
client = add_totals(client)
|
|
||||||
client = add_flags(client)
|
|
||||||
client = client.merge(contact_days, on="id", how="left")
|
|
||||||
client = client.merge(max_imp_day, on="id", how="left")
|
|
||||||
client = add_contact_density(client)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
# contact_days must already be present
|
|
||||||
if "contact_days" in df.columns:
|
|
||||||
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
|
|
||||||
return df
|
|
||||||
return df
|
|
||||||
@@ -1,188 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "4d7d3347",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Спам-гипотеза: плотность показов vs CTR/CR\n",
|
|
||||||
"\n",
|
|
||||||
"Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7acbd1c8",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import sqlite3\n",
|
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"from scipy import stats\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
|
||||||
"from sklearn.impute import SimpleImputer\n",
|
|
||||||
"from sklearn.metrics import roc_auc_score\n",
|
|
||||||
"\n",
|
|
||||||
"sns.set_theme(style=\"whitegrid\")\n",
|
|
||||||
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
|
||||||
"\n",
|
|
||||||
"project_root = Path.cwd().resolve()\n",
|
|
||||||
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
|
||||||
" project_root = project_root.parent\n",
|
|
||||||
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
|
||||||
"import eda_utils as eda\n",
|
|
||||||
"\n",
|
|
||||||
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
|
||||||
"conn = sqlite3.connect(db_path)\n",
|
|
||||||
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
|
||||||
"conn.close()\n",
|
|
||||||
"\n",
|
|
||||||
"for cols, name in [\n",
|
|
||||||
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
|
||||||
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
|
||||||
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
|
||||||
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
|
||||||
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
|
||||||
"]:\n",
|
|
||||||
" df[name] = df[cols].sum(axis=1)\n",
|
|
||||||
"\n",
|
|
||||||
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
|
||||||
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
|
||||||
"\n",
|
|
||||||
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
|
||||||
"client = df.groupby(\"id\").agg(\n",
|
|
||||||
" {\n",
|
|
||||||
" \"imp_total\": \"sum\",\n",
|
|
||||||
" \"click_total\": \"sum\",\n",
|
|
||||||
" \"orders_amt_total\": \"sum\",\n",
|
|
||||||
" \"age\": \"median\",\n",
|
|
||||||
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" }\n",
|
|
||||||
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
|
||||||
"\n",
|
|
||||||
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
|
||||||
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
|
||||||
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
|
||||||
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
|
||||||
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "94eb2d26",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Базовые статистики"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "287a09b4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"summary = client[[\"imp_total\", \"click_total\", \"orders_amt_total\", \"contact_days\", \"avg_imp_per_day\", \"ctr_all\", \"cr_click2order\"]].describe().T\n",
|
|
||||||
"missing = client.isna().mean().sort_values(ascending=False)\n",
|
|
||||||
"summary, missing.head(10)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "10cd44b7",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Корреляции и тесты\n",
|
|
||||||
"Спирмен между плотностью и CTR/CR, а также Mann–Whitney между Q1 и Q4 по плотности."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "88714a03",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"corr_ctr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"ctr_all\"])\n",
|
|
||||||
"corr_cr = stats.spearmanr(client[\"avg_imp_per_day\"], client[\"cr_click2order\"])\n",
|
|
||||||
"q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n",
|
|
||||||
"q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n",
|
|
||||||
"low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n",
|
|
||||||
"high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n",
|
|
||||||
"wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n",
|
|
||||||
"{ \"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu }\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
   "cell_type": "code",
   "execution_count": null,
   "id": "20d492fa",
   "metadata": {},
   "outputs": [],
   "source": [
|
|
||||||
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
|
|
||||||
"stats_bin = client.groupby(bins, observed=False).agg(\n",
|
|
||||||
" ctr_all=(\"ctr_all\", \"median\"),\n",
|
|
||||||
" cr_click2order=(\"cr_click2order\", \"median\"),\n",
|
|
||||||
" avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n",
|
|
||||||
").reset_index()\n",
|
|
||||||
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
|
||||||
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
|
||||||
"ax2 = ax1.twinx()\n",
|
|
||||||
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
|
|
||||||
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
|
|
||||||
"ax1.set_ylabel(\"CTR\")\n",
|
|
||||||
"ax2.set_ylabel(\"CR click→order\")\n",
|
|
||||||
"plt.xticks(rotation=35)\n",
|
|
||||||
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
|
|
||||||
"fig.tight_layout()\n",
|
|
||||||
"plt.show()\n",
|
|
||||||
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "943f0d4b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\")\n",
|
|
||||||
"stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
|
|
||||||
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
|
||||||
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
|
||||||
"ax2 = ax1.twinx()\n",
|
|
||||||
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
|
|
||||||
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
|
|
||||||
"ax1.set_ylabel(\"CTR\")\n",
|
|
||||||
"ax2.set_ylabel(\"CR click→order\")\n",
|
|
||||||
"plt.xticks(rotation=35)\n",
|
|
||||||
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
|
|
||||||
"fig.tight_layout()\n",
|
|
||||||
"plt.show()\n",
|
|
||||||
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python",
|
|
||||||
"version": "3.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,161 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "7254b4c1",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Спам-гипотеза: сравнение моделей\n",
|
|
||||||
"\n",
|
|
||||||
"Target: `high_ctr` (верхний квартиль CTR)."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c7f54168",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import sqlite3\n",
|
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"from scipy import stats\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
|
||||||
"from sklearn.impute import SimpleImputer\n",
|
|
||||||
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
|
|
||||||
"\n",
|
|
||||||
"sns.set_theme(style=\"whitegrid\")\n",
|
|
||||||
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
|
||||||
"\n",
|
|
||||||
"project_root = Path.cwd().resolve()\n",
|
|
||||||
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
|
||||||
" project_root = project_root.parent\n",
|
|
||||||
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
|
||||||
"import eda_utils as eda\n",
|
|
||||||
"\n",
|
|
||||||
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
|
||||||
"conn = sqlite3.connect(db_path)\n",
|
|
||||||
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
|
||||||
"conn.close()\n",
|
|
||||||
"\n",
|
|
||||||
"for cols, name in [\n",
|
|
||||||
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
|
||||||
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
|
||||||
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
|
||||||
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
|
||||||
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
|
||||||
"]:\n",
|
|
||||||
" df[name] = df[cols].sum(axis=1)\n",
|
|
||||||
"\n",
|
|
||||||
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
|
||||||
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
|
||||||
"\n",
|
|
||||||
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
|
||||||
"client = df.groupby(\"id\").agg(\n",
|
|
||||||
" {\n",
|
|
||||||
" \"imp_total\": \"sum\",\n",
|
|
||||||
" \"click_total\": \"sum\",\n",
|
|
||||||
" \"orders_amt_total\": \"sum\",\n",
|
|
||||||
" \"age\": \"median\",\n",
|
|
||||||
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" }\n",
|
|
||||||
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
|
||||||
"\n",
|
|
||||||
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
|
||||||
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
|
||||||
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
|
||||||
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
|
||||||
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "21786c63",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Модели: Logistic Regression vs GradientBoosting"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "dc8dbc94",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
|
|
||||||
"X = X.copy()\n",
|
|
||||||
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
|
|
||||||
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
|
|
||||||
"y = client[\"high_ctr\"]\n",
|
|
||||||
"\n",
|
|
||||||
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
|
|
||||||
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
|
|
||||||
"pre = ColumnTransformer([\n",
|
|
||||||
" (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
|
|
||||||
" (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
|
|
||||||
"])\n",
|
|
||||||
"\n",
|
|
||||||
"log_reg = Pipeline([(\"pre\", pre), (\"clf\", LogisticRegression(max_iter=1000))])\n",
|
|
||||||
"gb = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
|
|
||||||
"\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
|
||||||
"res = {}\n",
|
|
||||||
"for name, model in [(\"log_reg\", log_reg), (\"gb\", gb)]:\n",
|
|
||||||
" model.fit(X_train, y_train)\n",
|
|
||||||
" proba = model.predict_proba(X_test)[:, 1]\n",
|
|
||||||
" res[name] = roc_auc_score(y_test, proba)\n",
|
|
||||||
"res\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "203acf70",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Важности признаков (GradientBoosting)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "3eac9e17",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"gb_model = gb\n",
|
|
||||||
"feat_names = gb_model.named_steps[\"pre\"].get_feature_names_out()\n",
|
|
||||||
"importances = gb_model.named_steps[\"clf\"].feature_importances_\n",
|
|
||||||
"imp_df = pd.DataFrame({\"feature\": feat_names, \"importance\": importances}).sort_values(\"importance\", ascending=False)\n",
|
|
||||||
"plt.figure(figsize=(8, 5))\n",
|
|
||||||
"sns.barplot(data=imp_df.head(15), x=\"importance\", y=\"feature\", palette=\"viridis\")\n",
|
|
||||||
"plt.title(\"Top-15 feature importances (GB)\")\n",
|
|
||||||
"plt.tight_layout()\n",
|
|
||||||
"plt.show()\n",
|
|
||||||
"imp_df.head(15)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python",
|
|
||||||
"version": "3.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,206 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "d88bf2d8",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Спам-гипотеза: лучшая модель и визуализации\n",
|
|
||||||
"\n",
|
|
||||||
"Используем GradientBoostingClassifier (лучше логрега по AUC) для подтверждения гипотезы."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "87f3f728",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import sqlite3\n",
|
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"from scipy import stats\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
|
||||||
"from sklearn.impute import SimpleImputer\n",
|
|
||||||
"from sklearn.metrics import roc_auc_score\n",
|
|
||||||
"\n",
|
|
||||||
"sns.set_theme(style=\"whitegrid\")\n",
|
|
||||||
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
|
|
||||||
"\n",
|
|
||||||
"project_root = Path.cwd().resolve()\n",
|
|
||||||
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
|
|
||||||
" project_root = project_root.parent\n",
|
|
||||||
"sys.path.append(str(project_root / \"preanalysis\"))\n",
|
|
||||||
"import eda_utils as eda\n",
|
|
||||||
"\n",
|
|
||||||
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
|
|
||||||
"conn = sqlite3.connect(db_path)\n",
|
|
||||||
"df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
|
|
||||||
"conn.close()\n",
|
|
||||||
"\n",
|
|
||||||
"for cols, name in [\n",
|
|
||||||
" (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
|
|
||||||
" (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
|
|
||||||
" (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
|
|
||||||
" (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
|
|
||||||
" (eda.ORDER_COLS, \"orders_amt_total\"),\n",
|
|
||||||
"]:\n",
|
|
||||||
" df[name] = df[cols].sum(axis=1)\n",
|
|
||||||
"\n",
|
|
||||||
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
|
|
||||||
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
|
|
||||||
"\n",
|
|
||||||
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
|
|
||||||
"client = df.groupby(\"id\").agg(\n",
|
|
||||||
" {\n",
|
|
||||||
" \"imp_total\": \"sum\",\n",
|
|
||||||
" \"click_total\": \"sum\",\n",
|
|
||||||
" \"orders_amt_total\": \"sum\",\n",
|
|
||||||
" \"age\": \"median\",\n",
|
|
||||||
" \"gender_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" \"device_platform_cd\": lambda s: s.mode().iat[0],\n",
|
|
||||||
" }\n",
|
|
||||||
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
|
|
||||||
"\n",
|
|
||||||
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
|
|
||||||
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
|
|
||||||
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
|
|
||||||
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
|
|
||||||
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "17da010c",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Обучение лучшей модели"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "81433d7e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"X = client[[\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\", \"gender_cd\", \"device_platform_cd\"]]\n",
|
|
||||||
"X = X.copy()\n",
|
|
||||||
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
|
|
||||||
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
|
|
||||||
"y = client[\"high_ctr\"]\n",
|
|
||||||
"\n",
|
|
||||||
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
|
|
||||||
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
|
|
||||||
"pre = ColumnTransformer([\n",
|
|
||||||
" (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
|
|
||||||
" (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
|
|
||||||
"])\n",
|
|
||||||
"\n",
|
|
||||||
"best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
|
||||||
"best.fit(X_train, y_train)\n",
|
|
||||||
"proba = best.predict_proba(X_test)[:, 1]\n",
|
|
||||||
"auc = roc_auc_score(y_test, proba)\n",
|
|
||||||
"auc\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "63f4db9b",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Прогноз vs плотность показов"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "f48584b5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n",
|
|
||||||
"base = client.median(numeric_only=True)\n",
|
|
||||||
"base_gender = client[\"gender_cd\"].mode().iat[0]\n",
|
|
||||||
"base_device = client[\"device_platform_cd\"].mode().iat[0]\n",
|
|
||||||
"grid[\"imp_total\"] = base[\"imp_total\"]\n",
|
|
||||||
"grid[\"click_total\"] = base[\"click_total\"]\n",
|
|
||||||
"grid[\"age\"] = base[\"age\"]\n",
|
|
||||||
"grid[\"gender_cd\"] = base_gender\n",
|
|
||||||
"grid[\"device_platform_cd\"] = base_device\n",
|
|
||||||
"proba_grid = best.predict_proba(grid)[:, 1]\n",
|
|
||||||
"plt.figure(figsize=(10, 4))\n",
|
|
||||||
"plt.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n",
|
|
||||||
"plt.xlabel(\"avg_imp_per_day\")\n",
|
|
||||||
"plt.ylabel(\"P(high CTR)\")\n",
|
|
||||||
"plt.title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n",
|
|
||||||
"plt.tight_layout()\n",
|
|
||||||
"plt.show()\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "32f73b44",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## График CTR и CR по тонким бинам (две оси)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "bb4d0190",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\")\n",
|
|
||||||
"stats_bin = client.groupby(bins).agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"}).reset_index()\n",
|
|
||||||
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
|
|
||||||
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
|
|
||||||
"ax2 = ax1.twinx()\n",
|
|
||||||
"ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", color=\"#4c72b0\", label=\"CTR\")\n",
|
|
||||||
"ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n",
|
|
||||||
"ax1.set_ylabel(\"CTR\")\n",
|
|
||||||
"ax2.set_ylabel(\"CR click→order\")\n",
|
|
||||||
"ax1.set_xlabel(\"avg_imp_per_day bins\")\n",
|
|
||||||
"plt.xticks(rotation=35)\n",
|
|
||||||
"ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n",
|
|
||||||
"fig.tight_layout()\n",
|
|
||||||
"plt.show()\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "ebb2ca5e",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Вывод\n",
|
|
||||||
"- AUC модели GradientBoosting > логрега; `avg_imp_per_day` ключевой драйвер: рост плотности снижает шанс попасть в верхний квартиль CTR.\n",
|
|
||||||
"- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n",
|
|
||||||
"- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели."
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python",
|
|
||||||
"version": "3.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 109 KiB After Width: | Height: | Size: 100 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 41 KiB |
@@ -46,32 +46,62 @@ client = (
|
|||||||
.merge(contact_days, on="id", how="left")
|
.merge(contact_days, on="id", how="left")
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
|
# ... всё как у тебя до расчёта client["ctr_all"] включительно
|
||||||
|
|
||||||
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
|
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
|
||||||
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
||||||
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
|
|
||||||
|
|
||||||
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
|
# --- SPLIT СНАЧАЛА, ТАРГЕТ ПОТОМ ---
|
||||||
X = X.copy()
|
train_idx, test_idx = train_test_split(
|
||||||
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
|
client.index, test_size=0.2, random_state=42
|
||||||
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
|
)
|
||||||
y = client["high_ctr"]
|
|
||||||
|
|
||||||
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
|
train = client.loc[train_idx].copy()
|
||||||
|
test = client.loc[test_idx].copy()
|
||||||
|
|
||||||
|
thr = train["ctr_all"].quantile(0.75) # порог только по train
|
||||||
|
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
|
||||||
|
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)
|
||||||
|
|
||||||
|
# --- ФИЧИ БЕЗ click_total (иначе это чит) ---
|
||||||
|
X_train = train[[
|
||||||
|
"avg_imp_per_day", "imp_total", "contact_days", # можно оставить
|
||||||
|
"age", "gender_cd", "device_platform_cd"
|
||||||
|
]].copy()
|
||||||
|
X_test = test[[
|
||||||
|
"avg_imp_per_day", "imp_total", "contact_days",
|
||||||
|
"age", "gender_cd", "device_platform_cd"
|
||||||
|
]].copy()
|
||||||
|
|
||||||
|
X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
|
||||||
|
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
|
||||||
|
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
|
||||||
|
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
|
||||||
|
|
||||||
|
y_train = train["high_ctr"]
|
||||||
|
y_test = test["high_ctr"]
|
||||||
|
|
||||||
|
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
|
||||||
cat_cols = ["gender_cd", "device_platform_cd"]
|
cat_cols = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
pre = ColumnTransformer([
|
pre = ColumnTransformer([
|
||||||
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
|
("num", Pipeline([
|
||||||
|
("imputer", SimpleImputer(strategy="median")),
|
||||||
|
("scaler", StandardScaler())
|
||||||
|
]), num_cols),
|
||||||
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
|
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
|
||||||
])
|
])
|
||||||
|
|
||||||
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
|
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
|
||||||
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
|
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
|
||||||
|
|
||||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
results = {}
|
results = {}
|
||||||
for name, model in [("log_reg", log_reg), ("gb", gb)]:
|
for name, model in [("log_reg", log_reg), ("gb", gb)]:
|
||||||
model.fit(X_train, y_train)
|
model.fit(X_train, y_train)
|
||||||
proba = model.predict_proba(X_test)[:, 1]
|
proba = model.predict_proba(X_test)[:, 1]
|
||||||
results[name] = roc_auc_score(y_test, proba)
|
results[name] = roc_auc_score(y_test, proba)
|
||||||
|
|
||||||
|
print("CTR threshold (train 0.75q):", thr)
|
||||||
print("AUC results:", results)
|
print("AUC results:", results)
|
||||||
|
|
||||||
imp = gb.named_steps["clf"].feature_importances_
|
imp = gb.named_steps["clf"].feature_importances_
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 97 KiB After Width: | Height: | Size: 87 KiB |
Reference in New Issue
Block a user