{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Exploring alternative cache storage using duckdb and parquet\n", "\n", "https://claude.ai/chat/e49491f7-8b18-4fb7-b301-5c9997746079\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n", "Start loading data... 1730370862.4833238\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "829f7f3d58a74f1fbfdcfc202c2aaf84", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "fetched parquet -11.310973167419434\n", "Loaded 1836460 rows\n" ] } ], "source": [ "from ttools.tradecache import TradeCache\n", "from ttools.utils import zoneNY\n", "from pathlib import Path\n", "from datetime import datetime\n", "import logging\n", "import duckdb\n", "\n", "# Log INFO and above, each record rendered as \"LEVEL: message\".\n", "logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)\n", "\n", "# Trade cache stored under ./trade_cache.\n", "cache = TradeCache(\n", "    cleanup_after_days=7,\n", "    max_workers=4,  # tune to the available CPU cores\n", "    base_path=Path(\"./trade_cache\"),\n", ")\n", "\n", "# Load BAC for 2024-10-14 09:30 through 2024-10-20 16:00 (localized via zoneNY).\n", "df = cache.load_range(\n", "    symbol=\"BAC\",\n", "    start_date=zoneNY.localize(datetime(year=2024, month=10, day=14, hour=9, minute=30)),\n", "    end_date=zoneNY.localize(datetime(year=2024, month=10, day=20, hour=16, minute=0)),\n", "    # columns=['open', 'high', 'low', 'close', 'volume'],  # column filter left disabled\n", ")\n", "\n", "print(f\"Loaded {len(df)} rows\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DuckDB Schema:\n", " column_name column_type null key default 
extra\n", "0 x VARCHAR YES None None None\n", "1 p DOUBLE YES None None None\n", "2 s BIGINT YES None None None\n", "3 i BIGINT YES None None None\n", "4 c VARCHAR[] YES None None None\n", "5 z VARCHAR YES None None None\n", "6 t TIMESTAMP WITH TIME ZONE YES None None None\n", "\n", "Sample Data:\n", " x p s i c z \\\n", "0 T 41.870 27 62879146994030 [ , F, T, I] A \n", "1 D 41.965 1 71675241580848 [ , I] A \n", "2 D 41.965 1 71675241644625 [ , I] A \n", "3 D 41.850 1 71675241772360 [ , I] A \n", "4 N 41.960 416188 52983525028174 [ , O] A \n", "\n", " t \n", "0 2024-10-14 15:30:00.006480+02:00 \n", "1 2024-10-14 15:30:00.395802+02:00 \n", "2 2024-10-14 15:30:00.484008+02:00 \n", "3 2024-10-14 15:30:00.610005+02:00 \n", "4 2024-10-14 15:30:01.041599+02:00 \n", "\n", "Pandas Info:\n" ] }, { "ename": "NameError", "evalue": "name 'pd' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[4], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Let's check the schema first\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43mcheck_parquet_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[4], line 21\u001b[0m, in \u001b[0;36mcheck_parquet_schema\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Method 3: Using pandas\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mPandas Info:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_parquet(sample_file)\n\u001b[1;32m 22\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n", "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined" ] } ], "source": [ "from pathlib import Path\n", "\n", "import duckdb\n", "import pandas as pd\n", "\n", "\n", "def check_parquet_schema(sample_file=None):\n", "    \"\"\"Print the schema and a short preview of one cached parquet file.\n", "\n", "    The file is inspected three ways: a DuckDB DESCRIBE, the first five rows\n", "    read through DuckDB, and pandas' DataFrame.info().\n", "\n", "    Args:\n", "        sample_file: Path of the parquet file to inspect. Defaults to\n", "            ./trade_cache/temp/BAC_20241014.parquet.\n", "    \"\"\"\n", "    if sample_file is None:\n", "        sample_file = Path(\"./trade_cache\")/\"temp/BAC_20241014.parquet\"\n", "\n", "    # The path is embedded in a SQL string literal, so double any single quotes.\n", "    sql_path = str(sample_file).replace(\"'\", \"''\")\n", "\n", "    # Method 1: Using DuckDB describe\n", "    print(\"DuckDB Schema:\")\n", "    print(duckdb.sql(f\"DESCRIBE SELECT * FROM read_parquet('{sql_path}')\").df())\n", "\n", "    # Method 2: Just look at the data\n", "    print(\"\\nSample Data:\")\n", "    print(duckdb.sql(f\"\"\"\n", "        SELECT *\n", "        FROM read_parquet('{sql_path}')\n", "        LIMIT 5\n", "    \"\"\").df())\n", "\n", "    # Method 3: Using pandas\n", "    print(\"\\nPandas Info:\")\n", "    sample_df = pd.read_parquet(sample_file)\n", "    # info() prints its report itself and returns None, so it is not wrapped in print().\n", "    sample_df.info()\n", "\n", "\n", "# Let's check the schema first\n", "check_parquet_schema()" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }