agg cache optimized

This commit is contained in:
David Brazda
2024-10-31 13:19:00 +01:00
parent 5770d8324a
commit 47450e2740
9 changed files with 1183 additions and 379 deletions


@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exploring alternative cache storage using duckdb and parquet\n",
"\n",
"https://claude.ai/chat/e49491f7-8b18-4fb7-b301-5c9997746079\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n",
"Start loading data... 1730370862.4833238\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "829f7f3d58a74f1fbfdcfc202c2aaf84",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"fetched parquet -11.310973167419434\n",
"Loaded 1836460 rows\n"
]
}
],
"source": [
"from ttools.tradecache import TradeCache\n",
"from ttools.utils import zoneNY\n",
"from pathlib import Path\n",
"from datetime import datetime\n",
"import logging\n",
"import duckdb\n",
"\n",
"logging.basicConfig(\n",
" level=logging.INFO, # Set the minimum level (DEBUG, INFO, WARNING, ERROR, CRITICAL)\n",
" format='%(levelname)s: %(message)s' # Simple format showing level and message\n",
")\n",
"\n",
"cache = TradeCache(\n",
" base_path=Path(\"./trade_cache\"),\n",
" max_workers=4, # Adjust based on your CPU\n",
" cleanup_after_days=7\n",
")\n",
"\n",
"# Load data\n",
"df = cache.load_range(\n",
" symbol=\"BAC\",\n",
" start_date=zoneNY.localize(datetime(2024, 10, 14, 9, 30)),\n",
" end_date=zoneNY.localize(datetime(2024, 10, 20, 16, 0)),\n",
" #columns=['open', 'high', 'low', 'close', 'volume']\n",
")\n",
"\n",
"print(f\"Loaded {len(df)} rows\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DuckDB Schema:\n",
" column_name column_type null key default extra\n",
"0 x VARCHAR YES None None None\n",
"1 p DOUBLE YES None None None\n",
"2 s BIGINT YES None None None\n",
"3 i BIGINT YES None None None\n",
"4 c VARCHAR[] YES None None None\n",
"5 z VARCHAR YES None None None\n",
"6 t TIMESTAMP WITH TIME ZONE YES None None None\n",
"\n",
"Sample Data:\n",
" x p s i c z \\\n",
"0 T 41.870 27 62879146994030 [ , F, T, I] A \n",
"1 D 41.965 1 71675241580848 [ , I] A \n",
"2 D 41.965 1 71675241644625 [ , I] A \n",
"3 D 41.850 1 71675241772360 [ , I] A \n",
"4 N 41.960 416188 52983525028174 [ , O] A \n",
"\n",
" t \n",
"0 2024-10-14 15:30:00.006480+02:00 \n",
"1 2024-10-14 15:30:00.395802+02:00 \n",
"2 2024-10-14 15:30:00.484008+02:00 \n",
"3 2024-10-14 15:30:00.610005+02:00 \n",
"4 2024-10-14 15:30:01.041599+02:00 \n",
"\n",
"Pandas Info:\n"
]
},
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Let's check the schema first\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43mcheck_parquet_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[4], line 21\u001b[0m, in \u001b[0;36mcheck_parquet_schema\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Method 3: Using pandas\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mPandas Info:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_parquet(sample_file)\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39minfo())\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"source": [
"import duckdb\n",
"\n",
"def check_parquet_schema():\n",
" # Read one file and print its structure\n",
" sample_file = Path(\"./trade_cache\")/\"temp/BAC_20241014.parquet\"\n",
" \n",
" # Method 1: Using DuckDB describe\n",
" print(\"DuckDB Schema:\")\n",
" print(duckdb.sql(f\"DESCRIBE SELECT * FROM read_parquet('{sample_file}')\").df())\n",
" \n",
" # Method 2: Just look at the data\n",
" print(\"\\nSample Data:\")\n",
" print(duckdb.sql(f\"\"\"\n",
" SELECT *\n",
" FROM read_parquet('{sample_file}')\n",
" LIMIT 5\n",
" \"\"\").df())\n",
" \n",
" # Method 3: Using pandas\n",
" print(\"\\nPandas Info:\")\n",
" df = pd.read_parquet(sample_file)\n",
" print(df.info())\n",
"\n",
"# Let's check the schema first\n",
"check_parquet_schema()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,324 @@
# This goes to the main directory.
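"""Trade-level data cache.

Freshly fetched days are written as per-day parquet files in a temp area,
complete months are consolidated into hive-style partitions
(symbol=.../year=.../month=.../data.parquet), and range queries are served
over both through DuckDB.
"""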
from pathlib import Path
from datetime import datetime, date, timedelta
from typing import Optional, List, Set, Dict, Tuple
import pandas as pd
import duckdb
import pandas_market_calendars as mcal
from abc import ABC, abstractmethod
import logging
from ttools.utils import zoneNY
from concurrent.futures import ThreadPoolExecutor
from ttools.loaders import fetch_daily_stock_trades
import time
logger = logging.getLogger(__name__)
class TradeCache:
def __init__(
self,
base_path: Path,
market: str = 'NYSE',
max_workers: int = 4,
cleanup_after_days: int = 7
):
"""
Initialize TradeCache with monthly partitions and temp storage
Args:
base_path: Base directory for cache
market: Market calendar to use
max_workers: Max parallel fetches
cleanup_after_days: Days after which to clean temp files
"""
"""Initialize TradeCache with the same parameters but optimized for the new schema"""
self.base_path = Path(base_path)
self.temp_path = self.base_path / "temp"
self.base_path.mkdir(parents=True, exist_ok=True)
self.temp_path.mkdir(parents=True, exist_ok=True)
self.calendar = mcal.get_calendar(market)
self.max_workers = max_workers
self.cleanup_after_days = cleanup_after_days
        # Initialize DuckDB with memory and thread limits
self.con = duckdb.connect()
self.con.execute("SET memory_limit='16GB'")
self.con.execute("SET threads TO 8")
# Create the schema for our tables
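        # Columns as delivered by the trade feed: x=exchange, p=price, s=size,
        # i=trade id, c=condition codes, z=tape, t=timestamp (tz-aware)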
self.schema = """
x VARCHAR,
p DOUBLE,
s BIGINT,
i BIGINT,
c VARCHAR[],
z VARCHAR,
t TIMESTAMP WITH TIME ZONE
"""
self._trading_days_cache: Dict[Tuple[date, date], List[date]] = {}
def get_partition_path(self, symbol: str, year: int, month: int) -> Path:
"""Get path for a specific partition"""
return self.base_path / f"symbol={symbol}/year={year}/month={month}"
def get_temp_path(self, symbol: str, day: date) -> Path:
"""Get temporary file path for a day"""
return self.temp_path / f"{symbol}_{day:%Y%m%d}.parquet"
def get_trading_days(self, start_date: datetime, end_date: datetime) -> List[date]:
"""Get trading days with caching"""
key = (start_date.date(), end_date.date())
if key not in self._trading_days_cache:
schedule = self.calendar.schedule(start_date=start_date, end_date=end_date)
self._trading_days_cache[key] = [d.date() for d in schedule.index]
return self._trading_days_cache[key]
def cleanup_temp_files(self):
"""Clean up old temp files"""
cutoff = datetime.now() - timedelta(days=self.cleanup_after_days)
for file in self.temp_path.glob("*.parquet"):
try:
# Extract date from filename
date_str = file.stem.split('_')[1]
file_date = datetime.strptime(date_str, '%Y%m%d')
if file_date < cutoff:
file.unlink()
except Exception as e:
logger.warning(f"Error cleaning up {file}: {e}")
def consolidate_month(self, symbol: str, year: int, month: int) -> bool:
"""
Consolidate daily files into monthly partition only if we have complete month
Returns True if consolidation was successful
"""
# Get all temp files for this symbol and month
temp_files = list(self.temp_path.glob(f"{symbol}_{year:04d}{month:02d}*.parquet"))
if not temp_files:
return False
try:
# Get expected trading days for this month
start_date = zoneNY.localize(datetime(year, month, 1))
if month == 12:
end_date = zoneNY.localize(datetime(year + 1, 1, 1)) - timedelta(days=1)
else:
end_date = zoneNY.localize(datetime(year, month + 1, 1)) - timedelta(days=1)
trading_days = self.get_trading_days(start_date, end_date)
# Check if we have data for all trading days
temp_dates = set(datetime.strptime(f.stem.split('_')[1], '%Y%m%d').date()
for f in temp_files)
missing_days = set(trading_days) - temp_dates
# Only consolidate if we have all trading days
if missing_days:
logger.info(f"Skipping consolidation for {symbol} {year}-{month}: "
f"missing {len(missing_days)} trading days")
return False
# Proceed with consolidation since we have complete month
partition_path = self.get_partition_path(symbol, year, month)
partition_path.mkdir(parents=True, exist_ok=True)
file_path = partition_path / "data.parquet"
files_str = ', '.join(f"'{f}'" for f in temp_files)
            # Select the trade-schema columns, sort by timestamp, and write one compressed monthly file
self.con.execute(f"""
COPY (
SELECT x, p, s, i, c, z, t
FROM read_parquet([{files_str}])
ORDER BY t
)
TO '{file_path}'
(FORMAT PARQUET, COMPRESSION 'ZSTD')
""")
# Remove temp files only after successful write
for f in temp_files:
f.unlink()
logger.info(f"Successfully consolidated {symbol} {year}-{month} "
f"({len(temp_files)} files)")
return True
except Exception as e:
logger.error(f"Error consolidating {symbol} {year}-{month}: {e}")
return False
def fetch_remote_day(self, symbol: str, day: date) -> pd.DataFrame:
"""Implement this to fetch single day of data"""
min_datetime = zoneNY.localize(datetime.combine(day, datetime.min.time()))
max_datetime = zoneNY.localize(datetime.combine(day, datetime.max.time()))
return fetch_daily_stock_trades(symbol, min_datetime, max_datetime)
def _fetch_and_save_day(self, symbol: str, day: date) -> Optional[Path]:
"""Fetch and save a single day, returns file path if successful"""
try:
df_day = self.fetch_remote_day(symbol, day)
if df_day.empty:
return None
temp_file = self.get_temp_path(symbol, day)
df_day.to_parquet(temp_file, compression='ZSTD')
return temp_file
except Exception as e:
logger.error(f"Error fetching {symbol} for {day}: {e}")
return None
def load_range(
self,
symbol: str,
start_date: datetime,
end_date: datetime,
columns: Optional[List[str]] = None,
consolidate: bool = False
) -> pd.DataFrame:
"""Load data for date range, consolidating when complete months are detected"""
#self.cleanup_temp_files()
trading_days = self.get_trading_days(start_date, end_date)
        # Column selection: default to all columns in the trade schema
col_str = '*' if not columns else ', '.join(columns)
if consolidate:
# First check temp files for complete months
temp_files = list(self.temp_path.glob(f"{symbol}_*.parquet"))
if temp_files:
# Group temp files by month
monthly_temps: Dict[Tuple[int, int], Set[date]] = {}
for file in temp_files:
try:
# Extract date from filename
date_str = file.stem.split('_')[1]
file_date = datetime.strptime(date_str, '%Y%m%d').date()
key = (file_date.year, file_date.month)
if key not in monthly_temps:
monthly_temps[key] = set()
monthly_temps[key].add(file_date)
except Exception as e:
logger.warning(f"Error parsing temp file date {file}: {e}")
continue
# Check each month for completeness and consolidate if complete
for (year, month), dates in monthly_temps.items():
# Get trading days for this month
month_start = zoneNY.localize(datetime(year, month, 1))
if month == 12:
month_end = zoneNY.localize(datetime(year + 1, 1, 1)) - timedelta(days=1)
else:
month_end = zoneNY.localize(datetime(year, month + 1, 1)) - timedelta(days=1)
month_trading_days = set(self.get_trading_days(month_start, month_end))
# If we have all trading days for the month, consolidate
if month_trading_days.issubset(dates):
logger.info(f"Found complete month in temp files for {symbol} {year}-{month}")
self.consolidate_month(symbol, year, month)
#timing the load
time_start = time.time()
print("Start loading data...", time_start)
        # Load from both the consolidated monthly partitions (symbol=/year=/month=/data.parquet) and the per-day temp files
query = f"""
WITH monthly_data AS (
SELECT {col_str}
FROM read_parquet(
                '{self.base_path}/symbol={symbol}/year=*/month=*/*.parquet',
                union_by_name=true
)
WHERE t BETWEEN '{start_date}' AND '{end_date}'
),
temp_data AS (
SELECT {col_str}
FROM read_parquet(
'{self.temp_path}/{symbol}_*.parquet',
union_by_name=true
)
WHERE t BETWEEN '{start_date}' AND '{end_date}'
)
SELECT * FROM (
SELECT * FROM monthly_data
UNION ALL
SELECT * FROM temp_data
)
ORDER BY t
"""
try:
df_cached = self.con.execute(query).df()
except Exception as e:
logger.warning(f"Error reading cached data: {e}")
df_cached = pd.DataFrame()
print("fetched parquet", time_start - time.time())
if not df_cached.empty:
cached_days = set(df_cached['t'].dt.date)
missing_days = [d for d in trading_days if d not in cached_days]
else:
missing_days = trading_days
# Fetch missing days in parallel
if missing_days:
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
future_to_day = {
executor.submit(self._fetch_and_save_day, symbol, day): day
for day in missing_days
}
for future in future_to_day:
day = future_to_day[future]
try:
temp_file = future.result()
if temp_file:
logger.debug(f"Successfully fetched {symbol} for {day}")
except Exception as e:
logger.error(f"Error processing {symbol} for {day}: {e}")
# Check again for complete months after fetching new data
temp_files = list(self.temp_path.glob(f"{symbol}_*.parquet"))
if temp_files:
monthly_temps = {}
for file in temp_files:
try:
date_str = file.stem.split('_')[1]
file_date = datetime.strptime(date_str, '%Y%m%d').date()
key = (file_date.year, file_date.month)
if key not in monthly_temps:
monthly_temps[key] = set()
monthly_temps[key].add(file_date)
except Exception as e:
logger.warning(f"Error parsing temp file date {file}: {e}")
continue
# Check for complete months again
for (year, month), dates in monthly_temps.items():
month_start = zoneNY.localize(datetime(year, month, 1))
if month == 12:
month_end = zoneNY.localize(datetime(year + 1, 1, 1)) - timedelta(days=1)
else:
month_end = zoneNY.localize(datetime(year, month + 1, 1)) - timedelta(days=1)
month_trading_days = set(self.get_trading_days(month_start, month_end))
if month_trading_days.issubset(dates):
logger.info(f"Found complete month after fetching for {symbol} {year}-{month}")
self.consolidate_month(symbol, year, month)
# Load final data including any new fetches
try:
df_cached = self.con.execute(query).df()
except Exception as e:
logger.warning(f"Error reading final data: {e}")
df_cached = pd.DataFrame()
return df_cached.sort_values('t')
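
# Illustrative usage sketch, mirroring the exploration notebook in this commit;
# not part of the module API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    cache = TradeCache(
        base_path=Path("./trade_cache"),
        max_workers=4,          # adjust to the local CPU
        cleanup_after_days=7,
    )
    # Load a range of BAC trades (times are New York local, as in the notebook)
    trades = cache.load_range(
        symbol="BAC",
        start_date=zoneNY.localize(datetime(2024, 10, 14, 9, 30)),
        end_date=zoneNY.localize(datetime(2024, 10, 20, 16, 0)),
    )
    print(f"Loaded {len(trades)} rows")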

File diff suppressed because one or more lines are too long