added migrations

2025-12-12 20:34:40 +03:00
parent d1d016b2ed
commit dd8feb488e
5 changed files with 200 additions and 1 deletions
--- a/migrations/0001_csv_to_sqlite.py
+++ b/migrations/0001_csv_to_sqlite.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
+import pandas as pd
+
+# Identifier and human-readable summary of this migration.
+# NOTE(review): presumably read by the migration runner for bookkeeping/logging;
+# the consumer is not visible in this file -- confirm.
+MIGRATION_ID = "0001_csv_to_sqlite"
+DESCRIPTION = "Convert dataset/ds.csv into dataset/ds.sqlite table communications"
+
+# Number of CSV rows pandas loads per chunk in run(), so the whole file is
+# never held in memory at once.
+CHUNK_SIZE = 10000
+# Name of the SQLite table the CSV rows are written to; also used to build the
+# index names created in run().
+TABLE_NAME = "communications"
+
+
+def run(context) -> None:
+    dataset_dir = Path(getattr(context, "dataset_dir", Path.cwd()))
+    csv_path = getattr(context, "csv_path", dataset_dir / "ds.csv")
+    sqlite_path = getattr(context, "sqlite_path", dataset_dir / "ds.sqlite")
+    force = bool(getattr(context, "force", False))
+
+    if not csv_path.exists():
+        raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+    if sqlite_path.exists():
+        if force:
+            sqlite_path.unlink()
+        else:
+            print(f"SQLite database already exists at {sqlite_path}, skipping migration")
+            return
+
+    sqlite_path.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(sqlite_path)
+    try:
+        first_chunk = True
+        for chunk in pd.read_csv(csv_path, chunksize=CHUNK_SIZE):
+            chunk["business_dt"] = pd.to_datetime(chunk["business_dt"]).dt.strftime("%Y-%m-%d")
+            if_exists = "replace" if first_chunk else "append"
+            chunk.to_sql(TABLE_NAME, conn, if_exists=if_exists, index=False)
+            first_chunk = False
+
+        if first_chunk:
+            raise RuntimeError("Source CSV is empty, no rows were written to SQLite")
+
+        conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_id ON {TABLE_NAME}(id)")
+        conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_business_dt ON {TABLE_NAME}(business_dt)")
+        conn.commit()
+    finally:
+        conn.close()