diff --git a/spam_hypot/best_model_and_plots.py b/spam_hypot/best_model_and_plots.py index 2d43014..5f5ac00 100644 --- a/spam_hypot/best_model_and_plots.py +++ b/spam_hypot/best_model_and_plots.py @@ -5,13 +5,6 @@ import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import roc_auc_score sns.set_theme(style="whitegrid") plt.rcParams["figure.figsize"] = (10, 5) @@ -54,43 +47,6 @@ client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_t client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"]) -# таргет: высокий orders/impressions -client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int) - - -X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]] -X = X.copy() -X["gender_cd"] = eda.normalize_gender(X["gender_cd"]) -X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"]) -y = client["high_or"] - - -num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"] -cat_cols = ["gender_cd", "device_platform_cd"] -pre = ColumnTransformer([ - ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols), - ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols), -]) - -model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))]) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) -model.fit(X_train, y_train) -proba = model.predict_proba(X_test)[:, 1] -auc = roc_auc_score(y_test, proba) -print("Best model AUC:", auc) - -# Probability vs avg_imp_per_day -grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)}) -base = client.median(numeric_only=True) -base_gender = client["gender_cd"].mode().iat[0] -base_device = client["device_platform_cd"].mode().iat[0] -grid["imp_total"] = base["imp_total"] -grid["click_total"] = base["click_total"] -grid["age"] = base["age"] -grid["gender_cd"] = base_gender -grid["device_platform_cd"] = base_device -proba_grid = model.predict_proba(grid)[:, 1] - # Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending stats_imp = ( client.groupby("avg_imp_per_day", as_index=False) @@ -138,26 +94,51 @@ stats_f["orders_smooth"] = ( .rolling(window=w, center=True, min_periods=1) .mean() ) +# --- cost line (linear expenses) --- +# нормируем так, чтобы масштаб был сопоставим с заказами +c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max() +stats_f["cost_line"] = c * stats_f["avg_imp_per_day"] -plt.figure(figsize=(10, 10)) +# plot +plt.figure(figsize=(10, 8)) -# raw (filtered) curve plt.plot( - stats_f["avg_imp_per_day"], stats_f["orders_mean"], - marker="o", linewidth=1, alpha=0.35, label="Orders (mean)" + stats_f["avg_imp_per_day"], + stats_f["orders_mean"], + marker="o", + linewidth=1, + alpha=0.3, + label="Среднее число заказов" ) -# smoothed trend (RED) plt.plot( - stats_f["avg_imp_per_day"], stats_f["orders_smooth"], - color="red", linewidth=2.5, label=f"Smoothed trend" + stats_f["avg_imp_per_day"], + stats_f["orders_smooth"], + color="red", + linewidth=2.5, + label="Сглаженный тренд заказов" ) -plt.xlabel("avg_imp_per_day") -plt.ylabel("Orders (mean)") -plt.title(f"Orders + smoothed trend vs avg_imp_per_day") +plt.plot( + stats_f["avg_imp_per_day"], + stats_f["cost_line"], + color="black", + linestyle="--", + linewidth=2, + label="Линейные расходы на показы" +) + +plt.xlabel("Среднее число показов в день") +plt.ylabel("Среднее число заказов") +plt.title("Зависимость заказов от интенсивности коммуникаций") + plt.legend() +plt.grid(alpha=0.3) plt.tight_layout() -plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150) -print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png") +plt.savefig( + project_root / "spam_hypot" / "orders_vs_avg_imp_with_costs.png", + dpi=150 +) + +print("Saved orders_vs_avg_imp_with_costs.png") diff --git a/spam_hypot/orders_vs_avg_imp_with_costs.png b/spam_hypot/orders_vs_avg_imp_with_costs.png new file mode 100644 index 0000000..1137f15 Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_with_costs.png differ diff --git a/spam_hypot/orders_vs_avg_imp_without_costs.png b/spam_hypot/orders_vs_avg_imp_without_costs.png new file mode 100644 index 0000000..e3991f6 Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_without_costs.png differ diff --git a/spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png new file mode 100644 index 0000000..223518e Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png differ diff --git a/spam_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png new file mode 100644 index 0000000..7c224a2 Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png differ