ttools/tests/WIP-tradecache_duckdb_approach/hive_cache.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exploring an alternative cache storage approach using DuckDB and Parquet.\n",
    "\n",
    "https://claude.ai/chat/e49491f7-8b18-4fb7-b301-5c9997746079\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n",
      "Start loading data... 1730370862.4833238\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "829f7f3d58a74f1fbfdcfc202c2aaf84",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fetched parquet -11.310973167419434\n",
      "Loaded 1836460 rows\n"
     ]
    }
   ],
   "source": [
    "from ttools.tradecache import TradeCache\n",
    "from ttools.utils import zoneNY\n",
    "from pathlib import Path\n",
    "from datetime import datetime\n",
    "import logging\n",
    "import duckdb\n",
    "\n",
    "# Emit log records at INFO level and above, formatted as '<LEVEL>: <message>'\n",
    "logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')\n",
    "\n",
    "# Cache rooted at ./trade_cache; max_workers can be adjusted to the CPU\n",
    "cache = TradeCache(base_path=Path(\"./trade_cache\"), max_workers=4, cleanup_after_days=7)\n",
    "\n",
    "# Requested window, localized with zoneNY: 2024-10-14 09:30 to 2024-10-20 16:00\n",
    "range_start = zoneNY.localize(datetime(2024, 10, 14, 9, 30))\n",
    "range_end = zoneNY.localize(datetime(2024, 10, 20, 16, 0))\n",
    "\n",
    "# Load data (column selection left disabled: columns=['open', 'high', 'low', 'close', 'volume'])\n",
    "df = cache.load_range(symbol=\"BAC\", start_date=range_start, end_date=range_end)\n",
    "\n",
    "print(f\"Loaded {len(df)} rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DuckDB Schema:\n",
      "  column_name               column_type null   key default extra\n",
      "0           x                   VARCHAR  YES  None    None  None\n",
      "1           p                    DOUBLE  YES  None    None  None\n",
      "2           s                    BIGINT  YES  None    None  None\n",
      "3           i                    BIGINT  YES  None    None  None\n",
      "4           c                 VARCHAR[]  YES  None    None  None\n",
      "5           z                   VARCHAR  YES  None    None  None\n",
      "6           t  TIMESTAMP WITH TIME ZONE  YES  None    None  None\n",
      "\n",
      "Sample Data:\n",
      "   x       p       s               i             c  z  \\\n",
      "0  T  41.870      27  62879146994030  [ , F, T, I]  A   \n",
      "1  D  41.965       1  71675241580848        [ , I]  A   \n",
      "2  D  41.965       1  71675241644625        [ , I]  A   \n",
      "3  D  41.850       1  71675241772360        [ , I]  A   \n",
      "4  N  41.960  416188  52983525028174        [ , O]  A   \n",
      "\n",
      "                                 t  \n",
      "0 2024-10-14 15:30:00.006480+02:00  \n",
      "1 2024-10-14 15:30:00.395802+02:00  \n",
      "2 2024-10-14 15:30:00.484008+02:00  \n",
      "3 2024-10-14 15:30:00.610005+02:00  \n",
      "4 2024-10-14 15:30:01.041599+02:00  \n",
      "\n",
      "Pandas Info:\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'pd' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[4], line 25\u001b[0m\n\u001b[1;32m     22\u001b[0m     \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# Let's check the schema first\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43mcheck_parquet_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[4], line 21\u001b[0m, in \u001b[0;36mcheck_parquet_schema\u001b[0;34m()\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;66;03m# Method 3: Using pandas\u001b[39;00m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mPandas Info:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_parquet(sample_file)\n\u001b[1;32m     22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n",
      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
     ]
    }
   ],
   "source": [
    "import duckdb\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "def check_parquet_schema(sample_file=None):\n",
    "    \"\"\"Print the schema and a few rows of one cached parquet file.\n",
    "\n",
    "    Args:\n",
    "        sample_file: Path of the parquet file to inspect. When omitted,\n",
    "            ./trade_cache/temp/BAC_20241014.parquet is used.\n",
    "    \"\"\"\n",
    "    if sample_file is None:\n",
    "        sample_file = Path(\"./trade_cache\")/\"temp/BAC_20241014.parquet\"\n",
    "\n",
    "    # Method 1: Using DuckDB describe\n",
    "    print(\"DuckDB Schema:\")\n",
    "    print(duckdb.sql(f\"DESCRIBE SELECT * FROM read_parquet('{sample_file}')\").df())\n",
    "\n",
    "    # Method 2: Just look at the data\n",
    "    print(\"\\nSample Data:\")\n",
    "    print(duckdb.sql(f\"\"\"\n",
    "        SELECT *\n",
    "        FROM read_parquet('{sample_file}')\n",
    "        LIMIT 5\n",
    "    \"\"\").df())\n",
    "\n",
    "    # Method 3: Using pandas\n",
    "    print(\"\\nPandas Info:\")\n",
    "    sample_df = pd.read_parquet(sample_file)\n",
    "    # DataFrame.info() prints its report itself and returns None,\n",
    "    # so it is called directly instead of being wrapped in print().\n",
    "    sample_df.info()\n",
    "\n",
    "# Let's check the schema first\n",
    "check_parquet_schema()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}