from __future__ import annotations """Первая миграция: переносит сырое CSV в SQLite и создаёт индексы.""" import sqlite3 from pathlib import Path import pandas as pd MIGRATION_ID = "0001_csv_to_sqlite" DESCRIPTION = "Convert dataset/ds.csv into dataset/ds.sqlite table communications" CHUNK_SIZE = 10000 TABLE_NAME = "communications" def run(context) -> None: # Определяем пути и режим выполнения (force позволяет пересоздать БД) dataset_dir = Path(getattr(context, "dataset_dir", Path.cwd())) csv_path = getattr(context, "csv_path", dataset_dir / "ds.csv") sqlite_path = getattr(context, "sqlite_path", dataset_dir / "ds.sqlite") force = bool(getattr(context, "force", False)) if not csv_path.exists(): raise FileNotFoundError(f"CSV file not found: {csv_path}") if sqlite_path.exists(): if force: sqlite_path.unlink() else: print(f"SQLite database already exists at {sqlite_path}, skipping migration") return sqlite_path.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(sqlite_path) try: # Читаем CSV чанками, нормализуем дату и пишем в таблицу communications first_chunk = True for chunk in pd.read_csv(csv_path, chunksize=CHUNK_SIZE): chunk["business_dt"] = pd.to_datetime(chunk["business_dt"]).dt.strftime("%Y-%m-%d") if_exists = "replace" if first_chunk else "append" chunk.to_sql(TABLE_NAME, conn, if_exists=if_exists, index=False) first_chunk = False if first_chunk: raise RuntimeError("Source CSV is empty, no rows were written to SQLite") # Добавляем индексы, чтобы последующие выборки работали быстрее conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_id ON {TABLE_NAME}(id)") conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_business_dt ON {TABLE_NAME}(business_dt)") conn.commit() finally: conn.close()