little fixes in preanalyse

This commit is contained in:
MaximOksiuta
2025-12-14 17:49:58 +03:00
parent 3ba1eb728b
commit 4f8f266c3e

View File

@@ -1650,7 +1650,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 27,
"id": "bd9437d0",
"metadata": {
"ExecuteTime": {
@@ -1660,20 +1660,10 @@
},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'dataset/ds_clean.parquet'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m SAVE_CLEANED = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m SAVE_CLEANED:\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdataset/ds_clean.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfastparquet\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mSaved dataset/ds_clean.parquet\u001b[39m\u001b[33m'\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/pandas/util/_decorators.py:333\u001b[39m, in \u001b[36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 327\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) > num_allow_args:\n\u001b[32m 328\u001b[39m warnings.warn(\n\u001b[32m 329\u001b[39m msg.format(arguments=_format_argument_list(allow_args)),\n\u001b[32m 330\u001b[39m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[32m 331\u001b[39m stacklevel=find_stack_level(),\n\u001b[32m 332\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m333\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/pandas/core/frame.py:3124\u001b[39m, in \u001b[36mDataFrame.to_parquet\u001b[39m\u001b[34m(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)\u001b[39m\n\u001b[32m 3043\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 3044\u001b[39m \u001b[33;03mWrite a DataFrame to the binary parquet format.\u001b[39;00m\n\u001b[32m 3045\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 3120\u001b[39m \u001b[33;03m>>> content = f.read()\u001b[39;00m\n\u001b[32m 3121\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 3122\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mio\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mparquet\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m to_parquet\n\u001b[32m-> \u001b[39m\u001b[32m3124\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3125\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 3126\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3127\u001b[39m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3128\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3129\u001b[39m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3130\u001b[39m \u001b[43m \u001b[49m\u001b[43mpartition_cols\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpartition_cols\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3131\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3132\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3133\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/pandas/io/parquet.py:482\u001b[39m, in \u001b[36mto_parquet\u001b[39m\u001b[34m(df, path, engine, compression, index, storage_options, partition_cols, filesystem, **kwargs)\u001b[39m\n\u001b[32m 478\u001b[39m impl = get_engine(engine)\n\u001b[32m 480\u001b[39m path_or_buf: FilePath | WriteBuffer[\u001b[38;5;28mbytes\u001b[39m] = io.BytesIO() \u001b[38;5;28;01mif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m path\n\u001b[32m--> \u001b[39m\u001b[32m482\u001b[39m \u001b[43mimpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 483\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 484\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_or_buf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 485\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 486\u001b[39m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 487\u001b[39m \u001b[43m \u001b[49m\u001b[43mpartition_cols\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpartition_cols\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 488\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 489\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 490\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 491\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 493\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 494\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path_or_buf, io.BytesIO)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/pandas/io/parquet.py:351\u001b[39m, in \u001b[36mFastParquetImpl.write\u001b[39m\u001b[34m(self, df, path, compression, index, partition_cols, storage_options, filesystem, **kwargs)\u001b[39m\n\u001b[32m 346\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 347\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mstorage_options passed with file object or non-fsspec file path\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 348\u001b[39m )\n\u001b[32m 350\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m catch_warnings(record=\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m351\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mapi\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 352\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 353\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 354\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mwrite_index\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mpartition_on\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpartition_cols\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/fastparquet/writer.py:1343\u001b[39m, in \u001b[36mwrite\u001b[39m\u001b[34m(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times, custom_metadata, stats)\u001b[39m\n\u001b[32m 1339\u001b[39m fmd.key_value_metadata = kvm\n\u001b[32m 1341\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file_scheme == \u001b[33m'\u001b[39m\u001b[33msimple\u001b[39m\u001b[33m'\u001b[39m:\n\u001b[32m 1342\u001b[39m \u001b[38;5;66;03m# Case 'simple'\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1343\u001b[39m \u001b[43mwrite_simple\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfmd\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1344\u001b[39m \u001b[43m \u001b[49m\u001b[43mrow_group_offsets\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrow_group_offsets\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1345\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopen_with\u001b[49m\u001b[43m=\u001b[49m\u001b[43mopen_with\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1346\u001b[39m \u001b[43m \u001b[49m\u001b[43mhas_nulls\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mappend\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstats\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstats\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1347\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1348\u001b[39m \u001b[38;5;66;03m# Case 'hive', 'drill'\u001b[39;00m\n\u001b[32m 1349\u001b[39m write_multi(filename, data, fmd,\n\u001b[32m 1350\u001b[39m row_group_offsets=row_group_offsets,\n\u001b[32m 1351\u001b[39m compression=compression, file_scheme=file_scheme,\n\u001b[32m 1352\u001b[39m write_fmd=\u001b[38;5;28;01mTrue\u001b[39;00m, open_with=open_with,\n\u001b[32m 1353\u001b[39m mkdirs=mkdirs, partition_on=partition_on,\n\u001b[32m 1354\u001b[39m append=\u001b[38;5;28;01mFalse\u001b[39;00m, stats=stats)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages/fastparquet/writer.py:1002\u001b[39m, in \u001b[36mwrite_simple\u001b[39m\u001b[34m(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append, stats)\u001b[39m\n\u001b[32m 1000\u001b[39m write_to_file(fn)\n\u001b[32m 1001\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1002\u001b[39m of = \u001b[43mopen_with\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1003\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m of \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 1004\u001b[39m write_to_file(f)\n",
"\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'dataset/ds_clean.parquet'"
"name": "stdout",
"output_type": "stream",
"text": [
"Saved dataset/ds_clean.parquet\n"
]
}
],