added migrations
This commit is contained in:
48
migrations/0001_csv_to_sqlite.py
Normal file
48
migrations/0001_csv_to_sqlite.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Identifier of this migration; mirrors the module's file name.
# NOTE(review): not referenced in this module -- presumably read by the
# migration runner; confirm against the caller.
MIGRATION_ID = "0001_csv_to_sqlite"
# Human-readable summary of what the migration does.
DESCRIPTION = "Convert dataset/ds.csv into dataset/ds.sqlite table communications"

# Number of CSV rows held in memory and written to SQLite per batch.
CHUNK_SIZE = 10000
# Name of the SQLite table that receives the CSV rows.
TABLE_NAME = "communications"


def run(context) -> None:
    """Load the source CSV into a fresh SQLite database.

    The CSV is streamed in batches of ``CHUNK_SIZE`` rows into the table
    ``TABLE_NAME``. The ``business_dt`` column is normalised to
    ``YYYY-MM-DD`` text, and indexes are created on ``id`` and
    ``business_dt``.

    The database is built in a sibling ``<name>.tmp`` file and moved over
    ``sqlite_path`` only after every step succeeded. A failed run therefore
    never leaves a partial database behind (which a later run would mistake
    for a finished migration), and with ``force`` the previous database
    stays intact until its replacement is ready.

    Args:
        context: Object with optional attributes, all read via ``getattr``:
            ``dataset_dir`` (default: current working directory),
            ``csv_path`` (default: ``dataset_dir / "ds.csv"``),
            ``sqlite_path`` (default: ``dataset_dir / "ds.sqlite"``) and
            ``force`` (default: ``False``). Paths may be ``str`` or ``Path``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        RuntimeError: If the CSV contains no data rows.
    """
    dataset_dir = Path(getattr(context, "dataset_dir", Path.cwd()))
    # Coerce to Path so callers may supply plain strings.
    csv_path = Path(getattr(context, "csv_path", dataset_dir / "ds.csv"))
    sqlite_path = Path(getattr(context, "sqlite_path", dataset_dir / "ds.sqlite"))
    force = bool(getattr(context, "force", False))

    if not csv_path.exists():
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    if sqlite_path.exists() and not force:
        print(f"SQLite database already exists at {sqlite_path}, skipping migration")
        return

    sqlite_path.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = sqlite_path.with_name(sqlite_path.name + ".tmp")
    if tmp_path.exists():
        # Leftover from an interrupted earlier run.
        tmp_path.unlink()

    try:
        conn = sqlite3.connect(tmp_path)
        try:
            rows_written = 0
            for chunk in pd.read_csv(csv_path, chunksize=CHUNK_SIZE):
                if chunk.empty:
                    # A header-only CSV can yield a single empty chunk.
                    continue
                chunk["business_dt"] = pd.to_datetime(chunk["business_dt"]).dt.strftime("%Y-%m-%d")
                # The temp database starts empty, so "append" creates the
                # table on the first batch and extends it afterwards.
                chunk.to_sql(TABLE_NAME, conn, if_exists="append", index=False)
                rows_written += len(chunk)

            if rows_written == 0:
                raise RuntimeError("Source CSV is empty, no rows were written to SQLite")

            conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_id ON {TABLE_NAME}(id)")
            conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_business_dt ON {TABLE_NAME}(business_dt)")
            conn.commit()
        finally:
            # Must be closed before the file is renamed or removed.
            conn.close()

        # Publish the finished database, replacing any previous one.
        tmp_path.replace(sqlite_path)
    finally:
        # After a successful replace the temp file no longer exists; on any
        # failure this removes the partial build.
        if tmp_path.exists():
            tmp_path.unlink()
|
||||
Reference in New Issue
Block a user