179 lines
6.4 KiB
Plaintext
179 lines
6.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"# Exploring alternative cache storage using DuckDB and Parquet\n",
|
|
"\n",
|
|
"https://claude.ai/chat/e49491f7-8b18-4fb7-b301-5c9997746079\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n",
|
|
"Start loading data... 1730370862.4833238\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "829f7f3d58a74f1fbfdcfc202c2aaf84",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"fetched parquet -11.310973167419434\n",
|
|
"Loaded 1836460 rows\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"import logging\n",
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"import duckdb\n",
"import pandas as pd\n",
"\n",
"from ttools.tradecache import TradeCache\n",
"from ttools.utils import zoneNY\n",
|
|
"\n",
|
"# Emit INFO and above, formatted as \"LEVEL: message\".\n",
"logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')\n",
"\n",
"# Trade cache rooted at ./trade_cache; tune max_workers to the local CPU.\n",
"cache_dir = Path(\"./trade_cache\")\n",
"cache = TradeCache(base_path=cache_dir, max_workers=4, cleanup_after_days=7)\n",
"\n",
"# Requested window, localized with zoneNY.\n",
"range_start = zoneNY.localize(datetime(2024, 10, 14, 9, 30))\n",
"range_end = zoneNY.localize(datetime(2024, 10, 20, 16, 0))\n",
"\n",
"# Load data for the symbol over the window; no column filter is passed.\n",
"df = cache.load_range(symbol=\"BAC\", start_date=range_start, end_date=range_end)\n",
"\n",
"print(f\"Loaded {len(df)} rows\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"DuckDB Schema:\n",
|
|
" column_name column_type null key default extra\n",
|
|
"0 x VARCHAR YES None None None\n",
|
|
"1 p DOUBLE YES None None None\n",
|
|
"2 s BIGINT YES None None None\n",
|
|
"3 i BIGINT YES None None None\n",
|
|
"4 c VARCHAR[] YES None None None\n",
|
|
"5 z VARCHAR YES None None None\n",
|
|
"6 t TIMESTAMP WITH TIME ZONE YES None None None\n",
|
|
"\n",
|
|
"Sample Data:\n",
|
|
" x p s i c z \\\n",
|
|
"0 T 41.870 27 62879146994030 [ , F, T, I] A \n",
|
|
"1 D 41.965 1 71675241580848 [ , I] A \n",
|
|
"2 D 41.965 1 71675241644625 [ , I] A \n",
|
|
"3 D 41.850 1 71675241772360 [ , I] A \n",
|
|
"4 N 41.960 416188 52983525028174 [ , O] A \n",
|
|
"\n",
|
|
" t \n",
|
|
"0 2024-10-14 15:30:00.006480+02:00 \n",
|
|
"1 2024-10-14 15:30:00.395802+02:00 \n",
|
|
"2 2024-10-14 15:30:00.484008+02:00 \n",
|
|
"3 2024-10-14 15:30:00.610005+02:00 \n",
|
|
"4 2024-10-14 15:30:01.041599+02:00 \n",
|
|
"\n",
|
|
"Pandas Info:\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'pd' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[4], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Let's check the schema first\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43mcheck_parquet_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
"Cell \u001b[0;32mIn[4], line 21\u001b[0m, in \u001b[0;36mcheck_parquet_schema\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Method 3: Using pandas\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mPandas Info:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_parquet(sample_file)\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import duckdb\n",
|
|
"\n",
|
"def check_parquet_schema(sample_file=None):\n",
"    \"\"\"Print the structure of one cached parquet file in three ways.\n",
"\n",
"    Prints DuckDB's DESCRIBE output for the file, its first five rows read\n",
"    through DuckDB, and the pandas DataFrame.info() summary of the same file.\n",
"\n",
"    Args:\n",
"        sample_file: Path of the parquet file to inspect. When omitted,\n",
"            ./trade_cache/temp/BAC_20241014.parquet is used.\n",
"    \"\"\"\n",
"    if sample_file is None:\n",
"        sample_file = Path(\"./trade_cache\") / \"temp/BAC_20241014.parquet\"\n",
"\n",
"    # The path is embedded in a SQL string literal, so double any single\n",
"    # quote to keep the statement well-formed.\n",
"    quoted_path = str(sample_file).replace(\"'\", \"''\")\n",
"\n",
"    # Method 1: Using DuckDB describe\n",
"    print(\"DuckDB Schema:\")\n",
"    print(duckdb.sql(f\"DESCRIBE SELECT * FROM read_parquet('{quoted_path}')\").df())\n",
"\n",
"    # Method 2: Just look at the data\n",
"    print(\"\\nSample Data:\")\n",
"    print(duckdb.sql(f\"\"\"\n",
"        SELECT *\n",
"        FROM read_parquet('{quoted_path}')\n",
"        LIMIT 5\n",
"    \"\"\").df())\n",
"\n",
"    # Method 3: Using pandas\n",
"    print(\"\\nPandas Info:\")\n",
"    sample_df = pd.read_parquet(sample_file)\n",
"    # DataFrame.info() prints its report itself and returns None, so it is\n",
"    # called directly rather than wrapped in print().\n",
"    sample_df.info()\n",
"\n",
"\n",
"# Let's check the schema first\n",
"check_parquet_schema()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|