583 lines
18 KiB
Python
583 lines
18 KiB
Python
import sqlite3
|
|
from pathlib import Path
|
|
import sys
|
|
from typing import Tuple
|
|
|
|
import matplotlib.pyplot as plt
|
|
from scipy.signal import savgol_filter
|
|
import pandas as pd
|
|
import seaborn as sns
|
|
from statsmodels.nonparametric.smoothers_lowess import lowess
|
|
import numpy as np
|
|
|
|
sns.set_theme(style="whitegrid")
|
|
plt.rcParams["figure.figsize"] = (8, 8)
|
|
|
|
project_root = Path(__file__).resolve().parent.parent
|
|
DB_PATH = project_root / "dataset" / "ds.sqlite"
|
|
BASE_OUT_DIR = project_root / "main_hypot"
|
|
|
|
# Константы данных
|
|
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
|
|
|
# Константы визуализации/очистки
|
|
X_COL = "avg_imp_per_day" # x всегда фиксирован
|
|
DEFAULT_X_MAX = 18
|
|
DEFAULT_SCATTER_COLOR = "#2c7bb6"
|
|
DEFAULT_POINT_SIZE = 20
|
|
DEFAULT_ALPHA = 0.08
|
|
DEFAULT_TREND_ALPHA = 0.1
|
|
DEFAULT_TREND_FRAC = 0.3
|
|
DEFAULT_TREND_COLOR = "red"
|
|
DEFAULT_TREND_LINEWIDTH = 2.5
|
|
DEFAULT_IQR_K = 1.5
|
|
DEFAULT_Q_LOW = 0.05
|
|
DEFAULT_Q_HIGH = 0.95
|
|
DEFAULT_ALPHA_MIN = 0.04
|
|
DEFAULT_ALPHA_MAX = 0.7
|
|
DEFAULT_BINS_X = 60
|
|
DEFAULT_BINS_Y = 60
|
|
DEFAULT_Y_MIN = -0.5
|
|
DEFAULT_Y_MAX = 10
|
|
DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
|
|
DEFAULT_ROLLING_WINDOW = 200
|
|
DEFAULT_SAVGOL_WINDOW = 501
|
|
DEFAULT_SAVGOL_POLY = 2
|
|
|
|
|
|
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
|
|
denom = denominator.replace(0, pd.NA)
|
|
return numerator / denom
|
|
|
|
|
|
def load_client_level(db_path: Path) -> pd.DataFrame:
|
|
"""Собирает агрегаты по клиентам без зависимостей от eda_utils."""
|
|
conn = sqlite3.connect(db_path)
|
|
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
|
|
conn.close()
|
|
|
|
df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
|
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
|
|
|
client = (
|
|
df.groupby("id")
|
|
.agg(
|
|
imp_total=("imp_total", "sum"),
|
|
orders_amt_total=("orders_amt_total", "sum"),
|
|
contact_days=("business_dt", "nunique"),
|
|
)
|
|
.reset_index()
|
|
)
|
|
|
|
client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
|
|
print(f"Loaded {len(client)} clients with {X_COL} computed.")
|
|
return client
|
|
|
|
|
|
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
|
|
q1, q3 = series.quantile([q_low, q_high])
|
|
iqr = q3 - q1
|
|
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
|
|
|
|
|
|
def remove_outliers(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
x_col: str = X_COL,
|
|
iqr_k: float = DEFAULT_IQR_K,
|
|
q_low: float = DEFAULT_Q_LOW,
|
|
q_high: float = DEFAULT_Q_HIGH,
|
|
) -> pd.DataFrame:
|
|
"""Убирает выбросы по IQR отдельно по x и y."""
|
|
x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
|
|
y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
|
|
filtered = df[
|
|
df[x_col].between(max(0, x_low), x_high)
|
|
& df[y_col].between(max(0, y_low), y_high)
|
|
].copy()
|
|
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
|
|
return filtered
|
|
|
|
|
|
def compute_density_alpha(
|
|
df: pd.DataFrame,
|
|
x_col: str,
|
|
y_col: str,
|
|
x_max: float,
|
|
*,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max_limit: float = DEFAULT_Y_MAX,
|
|
) -> np.ndarray:
|
|
"""Считает насыщенность цвета как квадратичный скейл по плотности в 2D бинах."""
|
|
x_vals = df[x_col].to_numpy()
|
|
y_vals = df[y_col].to_numpy()
|
|
|
|
if len(x_vals) == 0:
|
|
return np.array([])
|
|
|
|
x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
|
|
y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
|
|
y_edges = np.linspace(y_min, y_upper, bins_y + 1)
|
|
|
|
x_bins = np.digitize(x_vals, x_edges) - 1
|
|
y_bins = np.digitize(y_vals, y_edges) - 1
|
|
|
|
valid = (
|
|
(x_bins >= 0) & (x_bins < bins_x) &
|
|
(y_bins >= 0) & (y_bins < bins_y)
|
|
)
|
|
counts = np.zeros((bins_x, bins_y), dtype=int)
|
|
for xb, yb in zip(x_bins[valid], y_bins[valid]):
|
|
counts[xb, yb] += 1
|
|
|
|
bin_counts = counts[
|
|
np.clip(x_bins, 0, bins_x - 1),
|
|
np.clip(y_bins, 0, bins_y - 1),
|
|
]
|
|
max_count = bin_counts.max() if len(bin_counts) else 1
|
|
if max_count == 0:
|
|
weight = np.zeros_like(bin_counts, dtype=float)
|
|
else:
|
|
weight = (bin_counts / max_count) ** np.sqrt(1.5)
|
|
weight = np.clip(weight, 0, 1)
|
|
return alpha_min + (alpha_max - alpha_min) * weight
|
|
|
|
|
|
def compute_trend(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
*,
|
|
x_col: str = X_COL,
|
|
method: str = DEFAULT_TREND_METHOD,
|
|
lowess_frac: float = DEFAULT_TREND_FRAC,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
) -> Tuple[np.ndarray, np.ndarray]:
|
|
"""Возвращает (x_sorted, trend_y) по выбранному методу."""
|
|
d = df[[x_col, y_col]].dropna().sort_values(x_col)
|
|
x_vals = d[x_col].to_numpy()
|
|
y_vals = d[y_col].to_numpy()
|
|
|
|
if len(x_vals) == 0:
|
|
return np.array([]), np.array([])
|
|
|
|
m = method.lower()
|
|
if m == "lowess":
|
|
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
|
|
return trend[:, 0], trend[:, 1]
|
|
if m == "rolling":
|
|
w = max(3, rolling_window)
|
|
if w % 2 == 0:
|
|
w += 1
|
|
y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
|
|
return x_vals, y_trend
|
|
if m == "savgol":
|
|
w = max(5, savgol_window)
|
|
if w % 2 == 0:
|
|
w += 1
|
|
poly = min(savgol_poly, w - 1)
|
|
y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
|
|
return x_vals, y_trend
|
|
|
|
# fallback to lowess
|
|
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
|
|
return trend[:, 0], trend[:, 1]
|
|
|
|
|
|
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
|
|
subset = df[df[x_col] <= x_max].copy()
|
|
print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
|
|
return subset
|
|
|
|
|
|
def plot_density_scatter(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
title: str,
|
|
out_path: Path,
|
|
*,
|
|
x_col: str = X_COL,
|
|
x_max: float = DEFAULT_X_MAX,
|
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
|
point_size: int = DEFAULT_POINT_SIZE,
|
|
alpha: float = DEFAULT_ALPHA,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max: float = DEFAULT_Y_MAX,
|
|
with_trend: bool = False,
|
|
trend_method: str = DEFAULT_TREND_METHOD,
|
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
|
trend_color: str = DEFAULT_TREND_COLOR,
|
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
return_fig: bool = False,
|
|
) -> None:
|
|
fig, ax = plt.subplots(figsize=(8, 8))
|
|
alpha_values = compute_density_alpha(
|
|
df,
|
|
x_col=x_col,
|
|
y_col=y_col,
|
|
x_max=x_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
y_min=y_min,
|
|
y_max_limit=y_max,
|
|
)
|
|
ax.scatter(
|
|
df[x_col],
|
|
df[y_col],
|
|
color=scatter_color,
|
|
s=point_size,
|
|
alpha=alpha_values if len(alpha_values) else alpha,
|
|
linewidths=0,
|
|
)
|
|
|
|
trend_data = None
|
|
if with_trend:
|
|
tx, ty = compute_trend(
|
|
df,
|
|
y_col=y_col,
|
|
x_col=x_col,
|
|
method=trend_method,
|
|
lowess_frac=trend_frac,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
if len(tx):
|
|
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
|
|
ax.legend()
|
|
trend_data = (tx, ty)
|
|
|
|
ax.set_xlim(0, x_max)
|
|
ax.set_ylim(y_min, y_max)
|
|
ax.set_yticks(range(0, int(y_max) + 1, 2))
|
|
ax.set_xlabel("Среднее число показов в день")
|
|
ax.set_ylabel(y_col)
|
|
ax.set_title(title)
|
|
ax.grid(alpha=0.3)
|
|
|
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
fig.tight_layout()
|
|
fig.savefig(out_path, dpi=150)
|
|
if return_fig:
|
|
return fig, ax, trend_data
|
|
plt.close(fig)
|
|
print(f"Saved {out_path}")
|
|
|
|
|
|
def plot_raw_scatter(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
out_dir: Path,
|
|
*,
|
|
x_col: str = X_COL,
|
|
x_max: float = DEFAULT_X_MAX,
|
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
|
point_size: int = DEFAULT_POINT_SIZE,
|
|
alpha: float = DEFAULT_ALPHA,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max: float = DEFAULT_Y_MAX,
|
|
trend_method: str = DEFAULT_TREND_METHOD,
|
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
|
trend_color: str = DEFAULT_TREND_COLOR,
|
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
) -> None:
|
|
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
|
|
plot_density_scatter(
|
|
in_range,
|
|
y_col=y_col,
|
|
title=f"Облако: {y_col} vs {x_col} (все клиенты)",
|
|
out_path=out_dir / "scatter.png",
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=alpha,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
|
|
|
|
def plot_clean_scatter(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
out_dir: Path,
|
|
*,
|
|
x_col: str = X_COL,
|
|
x_max: float = DEFAULT_X_MAX,
|
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
|
point_size: int = DEFAULT_POINT_SIZE,
|
|
alpha: float = DEFAULT_ALPHA,
|
|
iqr_k: float = DEFAULT_IQR_K,
|
|
q_low: float = DEFAULT_Q_LOW,
|
|
q_high: float = DEFAULT_Q_HIGH,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max: float = DEFAULT_Y_MAX,
|
|
trend_method: str = DEFAULT_TREND_METHOD,
|
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
|
trend_color: str = DEFAULT_TREND_COLOR,
|
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
) -> None:
|
|
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
|
|
cleaned = remove_outliers(
|
|
in_range,
|
|
y_col=y_col,
|
|
x_col=x_col,
|
|
iqr_k=iqr_k,
|
|
q_low=q_low,
|
|
q_high=q_high,
|
|
)
|
|
plot_density_scatter(
|
|
cleaned,
|
|
y_col=y_col,
|
|
title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
|
|
out_path=out_dir / "scatter_clean.png",
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=alpha,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
|
|
|
|
def plot_clean_trend_scatter(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
out_dir: Path,
|
|
*,
|
|
x_col: str = X_COL,
|
|
x_max: float = DEFAULT_X_MAX,
|
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
|
point_size: int = DEFAULT_POINT_SIZE,
|
|
alpha: float = DEFAULT_TREND_ALPHA,
|
|
iqr_k: float = DEFAULT_IQR_K,
|
|
q_low: float = DEFAULT_Q_LOW,
|
|
q_high: float = DEFAULT_Q_HIGH,
|
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
|
trend_color: str = DEFAULT_TREND_COLOR,
|
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max: float = DEFAULT_Y_MAX,
|
|
trend_method: str = DEFAULT_TREND_METHOD,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
return_components: bool = False,
|
|
) -> None:
|
|
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
|
|
cleaned = remove_outliers(
|
|
in_range,
|
|
y_col=y_col,
|
|
x_col=x_col,
|
|
iqr_k=iqr_k,
|
|
q_low=q_low,
|
|
q_high=q_high,
|
|
)
|
|
fig_ax = plot_density_scatter(
|
|
cleaned,
|
|
y_col=y_col,
|
|
title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
|
|
out_path=out_dir / "scatter_trend.png",
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=alpha,
|
|
with_trend=True,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
return_fig=return_components,
|
|
)
|
|
if return_components:
|
|
fig, ax, trend_data = fig_ax
|
|
return fig, ax, cleaned, trend_data
|
|
|
|
|
|
def generate_scatter_set(
|
|
df: pd.DataFrame,
|
|
y_col: str,
|
|
*,
|
|
base_out_dir: Path = BASE_OUT_DIR,
|
|
x_col: str = X_COL,
|
|
x_max: float = DEFAULT_X_MAX,
|
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
|
point_size: int = DEFAULT_POINT_SIZE,
|
|
alpha: float = DEFAULT_ALPHA,
|
|
trend_alpha: float = DEFAULT_TREND_ALPHA,
|
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
|
trend_color: str = DEFAULT_TREND_COLOR,
|
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
|
iqr_k: float = DEFAULT_IQR_K,
|
|
q_low: float = DEFAULT_Q_LOW,
|
|
q_high: float = DEFAULT_Q_HIGH,
|
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
|
bins_x: int = DEFAULT_BINS_X,
|
|
bins_y: int = DEFAULT_BINS_Y,
|
|
y_min: float = DEFAULT_Y_MIN,
|
|
y_max: float = DEFAULT_Y_MAX,
|
|
trend_method: str = DEFAULT_TREND_METHOD,
|
|
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
|
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
|
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
|
) -> None:
|
|
"""Генерирует три облака (все, без выбросов, без выбросов + тренд) в папку y_col."""
|
|
out_dir = base_out_dir / str(y_col).replace("/", "_")
|
|
plot_raw_scatter(
|
|
df,
|
|
y_col=y_col,
|
|
out_dir=out_dir,
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=alpha,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
plot_clean_scatter(
|
|
df,
|
|
y_col=y_col,
|
|
out_dir=out_dir,
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=alpha,
|
|
iqr_k=iqr_k,
|
|
q_low=q_low,
|
|
q_high=q_high,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
plot_clean_trend_scatter(
|
|
df,
|
|
y_col=y_col,
|
|
out_dir=out_dir,
|
|
x_col=x_col,
|
|
x_max=x_max,
|
|
scatter_color=scatter_color,
|
|
point_size=point_size,
|
|
alpha=trend_alpha,
|
|
iqr_k=iqr_k,
|
|
q_low=q_low,
|
|
q_high=q_high,
|
|
trend_frac=trend_frac,
|
|
trend_color=trend_color,
|
|
trend_linewidth=trend_linewidth,
|
|
alpha_min=alpha_min,
|
|
alpha_max=alpha_max,
|
|
bins_x=bins_x,
|
|
bins_y=bins_y,
|
|
y_min=y_min,
|
|
y_max=y_max,
|
|
trend_method=trend_method,
|
|
rolling_window=rolling_window,
|
|
savgol_window=savgol_window,
|
|
savgol_poly=savgol_poly,
|
|
)
|
|
|
|
|
|
def main() -> None:
|
|
client = load_client_level(DB_PATH)
|
|
zero_orders = (client["orders_amt_total"] == 0).sum()
|
|
non_zero = len(client) - zero_orders
|
|
if len(client):
|
|
print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
|
|
generate_scatter_set(client, y_col="orders_amt_total")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|