add full script

This commit is contained in:
dan
2025-12-13 03:10:17 +03:00
parent b274076ee7
commit 76ded21d8f
5 changed files with 37 additions and 56 deletions

View File

@@ -5,13 +5,6 @@ import numpy as np
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
sns.set_theme(style="whitegrid") sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5) plt.rcParams["figure.figsize"] = (10, 5)
@@ -54,43 +47,6 @@ client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_t
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"]) client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# таргет: высокий orders/impressions
client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
X = X.copy()
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
y = client["high_or"]
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print("Best model AUC:", auc)
# Probability vs avg_imp_per_day
grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
base = client.median(numeric_only=True)
base_gender = client["gender_cd"].mode().iat[0]
base_device = client["device_platform_cd"].mode().iat[0]
grid["imp_total"] = base["imp_total"]
grid["click_total"] = base["click_total"]
grid["age"] = base["age"]
grid["gender_cd"] = base_gender
grid["device_platform_cd"] = base_device
proba_grid = model.predict_proba(grid)[:, 1]
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending # Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
stats_imp = ( stats_imp = (
client.groupby("avg_imp_per_day", as_index=False) client.groupby("avg_imp_per_day", as_index=False)
@@ -138,26 +94,51 @@ stats_f["orders_smooth"] = (
.rolling(window=w, center=True, min_periods=1) .rolling(window=w, center=True, min_periods=1)
.mean() .mean()
) )
# --- cost line (linear expenses) ---
# нормируем так, чтобы масштаб был сопоставим с заказами
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
plt.figure(figsize=(10, 10)) # plot
plt.figure(figsize=(10, 8))
# raw (filtered) curve
plt.plot( plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_mean"], stats_f["avg_imp_per_day"],
marker="o", linewidth=1, alpha=0.35, label="Orders (mean)" stats_f["orders_mean"],
marker="o",
linewidth=1,
alpha=0.3,
label="Среднее число заказов"
) )
# smoothed trend (RED)
plt.plot( plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_smooth"], stats_f["avg_imp_per_day"],
color="red", linewidth=2.5, label=f"Smoothed trend" stats_f["orders_smooth"],
color="red",
linewidth=2.5,
label="Сглаженный тренд заказов"
) )
plt.xlabel("avg_imp_per_day") plt.plot(
plt.ylabel("Orders (mean)") stats_f["avg_imp_per_day"],
plt.title(f"Orders + smoothed trend vs avg_imp_per_day") stats_f["cost_line"],
color="black",
linestyle="--",
linewidth=2,
label="Линейные расходы на показы"
)
plt.xlabel("Среднее число показов в день")
plt.ylabel("Среднее число заказов")
plt.title("Зависимость заказов от интенсивности коммуникаций")
plt.legend() plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout() plt.tight_layout()
plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150) plt.savefig(
print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png") project_root / "spam_hypot" / "orders_vs_avg_imp_with_costs.png",
dpi=150
)
print("Saved orders_vs_avg_imp_with_costs.png")

Binary file not shown.

After

Width:  |  Height:  |  Size: 405 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 387 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 360 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 256 KiB