add full script
This commit is contained in:
@@ -5,13 +5,6 @@ import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
sns.set_theme(style="whitegrid")
|
||||
plt.rcParams["figure.figsize"] = (10, 5)
|
||||
@@ -54,43 +47,6 @@ client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_t
|
||||
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
|
||||
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
||||
|
||||
# таргет: высокий orders/impressions
|
||||
client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
|
||||
|
||||
|
||||
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
|
||||
X = X.copy()
|
||||
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
|
||||
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
|
||||
y = client["high_or"]
|
||||
|
||||
|
||||
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
|
||||
cat_cols = ["gender_cd", "device_platform_cd"]
|
||||
pre = ColumnTransformer([
|
||||
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
|
||||
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
|
||||
])
|
||||
|
||||
model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
||||
model.fit(X_train, y_train)
|
||||
proba = model.predict_proba(X_test)[:, 1]
|
||||
auc = roc_auc_score(y_test, proba)
|
||||
print("Best model AUC:", auc)
|
||||
|
||||
# Probability vs avg_imp_per_day
|
||||
grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
|
||||
base = client.median(numeric_only=True)
|
||||
base_gender = client["gender_cd"].mode().iat[0]
|
||||
base_device = client["device_platform_cd"].mode().iat[0]
|
||||
grid["imp_total"] = base["imp_total"]
|
||||
grid["click_total"] = base["click_total"]
|
||||
grid["age"] = base["age"]
|
||||
grid["gender_cd"] = base_gender
|
||||
grid["device_platform_cd"] = base_device
|
||||
proba_grid = model.predict_proba(grid)[:, 1]
|
||||
|
||||
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
|
||||
stats_imp = (
|
||||
client.groupby("avg_imp_per_day", as_index=False)
|
||||
@@ -138,26 +94,51 @@ stats_f["orders_smooth"] = (
|
||||
.rolling(window=w, center=True, min_periods=1)
|
||||
.mean()
|
||||
)
|
||||
# --- cost line (linear expenses) ---
|
||||
# нормируем так, чтобы масштаб был сопоставим с заказами
|
||||
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
|
||||
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
|
||||
|
||||
plt.figure(figsize=(10, 10))
|
||||
# plot
|
||||
plt.figure(figsize=(10, 8))
|
||||
|
||||
# raw (filtered) curve
|
||||
plt.plot(
|
||||
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
|
||||
marker="o", linewidth=1, alpha=0.35, label="Orders (mean)"
|
||||
stats_f["avg_imp_per_day"],
|
||||
stats_f["orders_mean"],
|
||||
marker="o",
|
||||
linewidth=1,
|
||||
alpha=0.3,
|
||||
label="Среднее число заказов"
|
||||
)
|
||||
|
||||
# smoothed trend (RED)
|
||||
plt.plot(
|
||||
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
|
||||
color="red", linewidth=2.5, label=f"Smoothed trend"
|
||||
stats_f["avg_imp_per_day"],
|
||||
stats_f["orders_smooth"],
|
||||
color="red",
|
||||
linewidth=2.5,
|
||||
label="Сглаженный тренд заказов"
|
||||
)
|
||||
|
||||
plt.xlabel("avg_imp_per_day")
|
||||
plt.ylabel("Orders (mean)")
|
||||
plt.title(f"Orders + smoothed trend vs avg_imp_per_day")
|
||||
plt.plot(
|
||||
stats_f["avg_imp_per_day"],
|
||||
stats_f["cost_line"],
|
||||
color="black",
|
||||
linestyle="--",
|
||||
linewidth=2,
|
||||
label="Линейные расходы на показы"
|
||||
)
|
||||
|
||||
plt.xlabel("Среднее число показов в день")
|
||||
plt.ylabel("Среднее число заказов")
|
||||
plt.title("Зависимость заказов от интенсивности коммуникаций")
|
||||
|
||||
plt.legend()
|
||||
plt.grid(alpha=0.3)
|
||||
plt.tight_layout()
|
||||
|
||||
plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150)
|
||||
print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png")
|
||||
plt.savefig(
|
||||
project_root / "spam_hypot" / "orders_vs_avg_imp_with_costs.png",
|
||||
dpi=150
|
||||
)
|
||||
|
||||
print("Saved orders_vs_avg_imp_with_costs.png")
|
||||
|
||||
BIN
spam_hypot/orders_vs_avg_imp_with_costs.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_with_costs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 405 KiB |
BIN
spam_hypot/orders_vs_avg_imp_without_costs.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_without_costs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 387 KiB |
BIN
spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 360 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 256 KiB |
Reference in New Issue
Block a user