add full script
This commit is contained in:
@@ -5,13 +5,6 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
||||||
from sklearn.compose import ColumnTransformer
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.ensemble import GradientBoostingClassifier
|
|
||||||
from sklearn.metrics import roc_auc_score
|
|
||||||
|
|
||||||
sns.set_theme(style="whitegrid")
|
sns.set_theme(style="whitegrid")
|
||||||
plt.rcParams["figure.figsize"] = (10, 5)
|
plt.rcParams["figure.figsize"] = (10, 5)
|
||||||
@@ -54,43 +47,6 @@ client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_t
|
|||||||
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
|
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
|
||||||
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
||||||
|
|
||||||
# таргет: высокий orders/impressions
|
|
||||||
client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
|
|
||||||
|
|
||||||
|
|
||||||
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
|
|
||||||
X = X.copy()
|
|
||||||
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
|
|
||||||
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
|
|
||||||
y = client["high_or"]
|
|
||||||
|
|
||||||
|
|
||||||
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
|
|
||||||
cat_cols = ["gender_cd", "device_platform_cd"]
|
|
||||||
pre = ColumnTransformer([
|
|
||||||
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
|
|
||||||
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
|
|
||||||
])
|
|
||||||
|
|
||||||
model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
|
|
||||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
model.fit(X_train, y_train)
|
|
||||||
proba = model.predict_proba(X_test)[:, 1]
|
|
||||||
auc = roc_auc_score(y_test, proba)
|
|
||||||
print("Best model AUC:", auc)
|
|
||||||
|
|
||||||
# Probability vs avg_imp_per_day
|
|
||||||
grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
|
|
||||||
base = client.median(numeric_only=True)
|
|
||||||
base_gender = client["gender_cd"].mode().iat[0]
|
|
||||||
base_device = client["device_platform_cd"].mode().iat[0]
|
|
||||||
grid["imp_total"] = base["imp_total"]
|
|
||||||
grid["click_total"] = base["click_total"]
|
|
||||||
grid["age"] = base["age"]
|
|
||||||
grid["gender_cd"] = base_gender
|
|
||||||
grid["device_platform_cd"] = base_device
|
|
||||||
proba_grid = model.predict_proba(grid)[:, 1]
|
|
||||||
|
|
||||||
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
|
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
|
||||||
stats_imp = (
|
stats_imp = (
|
||||||
client.groupby("avg_imp_per_day", as_index=False)
|
client.groupby("avg_imp_per_day", as_index=False)
|
||||||
@@ -138,26 +94,51 @@ stats_f["orders_smooth"] = (
|
|||||||
.rolling(window=w, center=True, min_periods=1)
|
.rolling(window=w, center=True, min_periods=1)
|
||||||
.mean()
|
.mean()
|
||||||
)
|
)
|
||||||
|
# --- cost line (linear expenses) ---
|
||||||
|
# нормируем так, чтобы масштаб был сопоставим с заказами
|
||||||
|
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
|
||||||
|
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
|
||||||
|
|
||||||
plt.figure(figsize=(10, 10))
|
# plot
|
||||||
|
plt.figure(figsize=(10, 8))
|
||||||
|
|
||||||
# raw (filtered) curve
|
|
||||||
plt.plot(
|
plt.plot(
|
||||||
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
|
stats_f["avg_imp_per_day"],
|
||||||
marker="o", linewidth=1, alpha=0.35, label="Orders (mean)"
|
stats_f["orders_mean"],
|
||||||
|
marker="o",
|
||||||
|
linewidth=1,
|
||||||
|
alpha=0.3,
|
||||||
|
label="Среднее число заказов"
|
||||||
)
|
)
|
||||||
|
|
||||||
# smoothed trend (RED)
|
|
||||||
plt.plot(
|
plt.plot(
|
||||||
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
|
stats_f["avg_imp_per_day"],
|
||||||
color="red", linewidth=2.5, label=f"Smoothed trend"
|
stats_f["orders_smooth"],
|
||||||
|
color="red",
|
||||||
|
linewidth=2.5,
|
||||||
|
label="Сглаженный тренд заказов"
|
||||||
)
|
)
|
||||||
|
|
||||||
plt.xlabel("avg_imp_per_day")
|
plt.plot(
|
||||||
plt.ylabel("Orders (mean)")
|
stats_f["avg_imp_per_day"],
|
||||||
plt.title(f"Orders + smoothed trend vs avg_imp_per_day")
|
stats_f["cost_line"],
|
||||||
|
color="black",
|
||||||
|
linestyle="--",
|
||||||
|
linewidth=2,
|
||||||
|
label="Линейные расходы на показы"
|
||||||
|
)
|
||||||
|
|
||||||
|
plt.xlabel("Среднее число показов в день")
|
||||||
|
plt.ylabel("Среднее число заказов")
|
||||||
|
plt.title("Зависимость заказов от интенсивности коммуникаций")
|
||||||
|
|
||||||
plt.legend()
|
plt.legend()
|
||||||
|
plt.grid(alpha=0.3)
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
|
|
||||||
plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150)
|
plt.savefig(
|
||||||
print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png")
|
project_root / "spam_hypot" / "orders_vs_avg_imp_with_costs.png",
|
||||||
|
dpi=150
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Saved orders_vs_avg_imp_with_costs.png")
|
||||||
|
|||||||
BIN
spam_hypot/orders_vs_avg_imp_with_costs.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_with_costs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 405 KiB |
BIN
spam_hypot/orders_vs_avg_imp_without_costs.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_without_costs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 387 KiB |
BIN
spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
Normal file
BIN
spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 360 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 256 KiB |
Reference in New Issue
Block a user