agg cache optimized

This commit is contained in:
David Brazda
2024-10-31 13:19:00 +01:00
parent 5770d8324a
commit 47450e2740
9 changed files with 1183 additions and 379 deletions

View File

@ -1,4 +1,4 @@
from .vbtutils import AnchoredIndicator, create_mask_from_window, isrising, isfalling, isrisingc, isfallingc, trades2entries_exits, figs2cell
from .vbtindicators import register_custom_inds
from .utils import find_dotenv, AggType, zoneNY, zonePRG, zoneUTC
from .utils import AggType, zoneNY, zonePRG, zoneUTC
from .loaders import load_data, prepare_trade_cache

View File

@ -1,14 +1,35 @@
from dotenv import load_dotenv
from appdirs import user_data_dir
from ttools.utils import find_dotenv
import ttools.utils as utils
import os
import pytz
import vectorbtpro as vbt
import pytz
from pathlib import Path
from dotenv import load_dotenv
import os
def find_dotenv(start_path=None):
    """
    Search for a ``.env`` file in a directory and its parents.

    Args:
        start_path: Directory to start searching from. When ``None``, the
            directory containing this module is used; in environments where
            ``__file__`` is undefined (e.g. notebooks) the current working
            directory is used instead.

    Returns:
        Path to the first ``.env`` found walking upward (up to 10 levels),
        otherwise ``None``.
    """
    if start_path is None:
        try:
            # Search from this module's directory, not the module file itself.
            start_path = Path(__file__).parent
        except NameError:
            # __file__ is undefined in notebooks / interactive sessions.
            start_path = os.getcwd()
    current_path = Path(start_path)
    for _ in range(10):  # limit search depth to 10 levels
        dotenv_path = current_path / '.env'
        if dotenv_path.exists():
            return dotenv_path
        if current_path.parent == current_path:
            # Reached the filesystem root; nothing more to search.
            return None
        current_path = current_path.parent
    return None


# Resolved once at import time; None when no .env file is found.
ENV_FILE = find_dotenv()

View File

@ -1,8 +1,5 @@
from ctypes import Union
from dotenv import load_dotenv
from appdirs import user_data_dir
from ttools.utils import find_dotenv
from ttools.config import *
from datetime import datetime
from alpaca.data.historical import StockHistoricalDataClient
@ -16,7 +13,7 @@ from time import time as timetime
from concurrent.futures import ThreadPoolExecutor
from alpaca.data.enums import DataFeed
import random
from ttools.utils import AggType, fetch_calendar_data, print, set_verbose
from ttools.utils import AggType, fetch_calendar_data, print, print_matching_files_info, set_verbose, list_matching_files
from tqdm import tqdm
import threading
from typing import List, Union
@ -393,9 +390,25 @@ def load_data(symbol: Union[str, List[str]],
excludes_str = ''.join(map(str, exclude_conditions))
file_ohlcv = AGG_CACHE / f"{symbol}-{str(agg_type)}-{str(resolution)}-{start_date.strftime('%Y-%m-%dT%H-%M-%S')}-{end_date.strftime('%Y-%m-%dT%H-%M-%S')}-{str(excludes_str)}-{minsize}-{main_session_only}.parquet"
if not force_remote and file_ohlcv.exists():
ohlcv_df = pd.read_parquet(file_ohlcv, engine='pyarrow')
print("Loaded from agg_cache", file_ohlcv)
#if matching files with same condition and same or wider date span
matched_files = list_matching_files(
symbol=symbol,
agg_type=str(agg_type),
resolution=str(resolution),
start_date=start_date,
end_date=end_date,
excludes_str=str(excludes_str),
minsize=minsize,
main_session_only=main_session_only
)
print("matched agg files", len(matched_files))
print_matching_files_info(matched_files)
if not force_remote and len(matched_files) > 0:
ohlcv_df = pd.read_parquet(matched_files[0],
engine='pyarrow',
filters=[('time', '>=', start_date), ('time', '<=', end_date)])
print("Loaded from agg_cache", matched_files[0])
return ohlcv_df
else:
#neslo by zrychlit, kdyz se zobrazuje pomalu Searching cache - nejaky bottle neck?
@ -411,6 +424,11 @@ def load_data(symbol: Union[str, List[str]],
ret_dict_df[symbol] = load_data_single(symbol, agg_type, resolution, start_date, end_date, exclude_conditions, minsize, main_session_only, force_remote)
if return_vbt:
try:
import vectorbtpro as vbt # Import only when needed
except ImportError:
raise RuntimeError("vectorbtpro is required for return_vbt. Please install it.")
return vbt.Data.from_data(vbt.symbol_dict(ret_dict_df), tz_convert=zoneNY)
return ret_dict_df

View File

@ -2,8 +2,10 @@ from pathlib import Path
from enum import Enum
from datetime import datetime, timedelta
from typing import List, Tuple
import re
import pytz
import calendar
from ttools.config import AGG_CACHE
import os
from alpaca.trading.models import Order, TradeUpdate, Calendar
import pandas_market_calendars as mcal
@ -26,6 +28,147 @@ def set_verbose(value):
global verbose
verbose = value
def parse_filename(filename: str) -> dict:
    """
    Decompose an AGG_CACHE parquet file name into its components.

    Expected shape:
        SYMBOL-AGGTYPE-RESOLUTION-START-END-EXCLUDES-MINSIZE-FLAG.parquet
    where START/END are ``%Y-%m-%dT%H-%M-%S`` timestamps.

    Returns:
        Dict with keys symbol, agg_type, resolution, start_date, end_date,
        excludes_str, minsize, main_session_only — or ``None`` when the
        name does not match the cache naming scheme.
    """
    stamp = r"\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}"
    pattern = (
        r"^(?P<symbol>[A-Z]+)-"
        r"(?P<agg_type>[^-]+)-"
        r"(?P<resolution>\d+)-"
        rf"(?P<start>{stamp})-"
        rf"(?P<end>{stamp})-"
        r"(?P<excludes>[A-Z0-9]+)-"
        r"(?P<minsize>\d+)-"
        r"(?P<flag>True|False)"
        r"\.parquet$"
    )
    matched = re.match(pattern, filename)
    if matched is None:
        return None
    try:
        return {
            'symbol': matched.group('symbol'),
            'agg_type': matched.group('agg_type'),
            'resolution': matched.group('resolution'),
            'start_date': datetime.strptime(matched.group('start'), '%Y-%m-%dT%H-%M-%S'),
            'end_date': datetime.strptime(matched.group('end'), '%Y-%m-%dT%H-%M-%S'),
            'excludes_str': matched.group('excludes'),
            'minsize': int(matched.group('minsize')),
            'main_session_only': matched.group('flag') == 'True',
        }
    except (ValueError, AttributeError):
        # Malformed timestamp or number despite matching the pattern.
        return None
def list_matching_files(
    symbol: str = None,
    agg_type: str = None,
    resolution: str = None,
    start_date: datetime = None,
    end_date: datetime = None,
    excludes_str: str = None,
    minsize: int = None,
    main_session_only: bool = None
) -> list[Path]:
    """
    Return cached aggregation files matching the given criteria,
    sorted by (start date, end date).

    A ``None`` criterion matches any value. When both ``start_date`` and
    ``end_date`` are provided, a file matches only if its date span fully
    covers the requested interval; with a single bound, any file whose
    span reaches that bound matches.

    Example:
        ```python
        # Example with all parameters specified
        specific_files = list_matching_files(
            symbol="SPY",
            agg_type="AggType.OHLCV",
            resolution="12",
            start_date=datetime(2024, 1, 15, 9, 30),
            end_date=datetime(2024, 1, 15, 16, 0),
            excludes_str="4679BCFMOPUVWZ",
            minsize=100,
            main_session_only=True
        )
        print_matching_files_info(specific_files)
        ```
    """
    # File names carry naive timestamps, so compare with naive datetimes.
    if start_date is not None:
        start_date = start_date.replace(tzinfo=None)
    if end_date is not None:
        end_date = end_date.replace(tzinfo=None)

    # Non-date criteria as (parsed-info key, required value) pairs.
    exact_criteria = (
        ('symbol', symbol),
        ('agg_type', agg_type),
        ('resolution', resolution),
        ('excludes_str', excludes_str),
        ('minsize', minsize),
        ('main_session_only', main_session_only),
    )

    def _is_match(info: dict) -> bool:
        """True when a parsed filename satisfies every active criterion."""
        if not info:
            return False
        for key, wanted in exact_criteria:
            if wanted is not None and info[key] != wanted:
                return False
        if start_date is not None and end_date is not None:
            # Require full coverage of the requested interval.
            return (info['start_date'] <= start_date
                    and info['end_date'] >= end_date)
        if start_date is not None:
            return info['end_date'] >= start_date
        if end_date is not None:
            return info['start_date'] <= end_date
        return True

    candidates = []
    for entry in AGG_CACHE.iterdir():
        if not entry.is_file() or not entry.name.endswith('.parquet'):
            continue
        parsed = parse_filename(entry.name)
        if _is_match(parsed):
            candidates.append((entry, parsed))

    # Order by start date, then end date, and strip the parsed info.
    candidates.sort(key=lambda pair: (pair[1]['start_date'], pair[1]['end_date']))
    return [path for path, _ in candidates]
def print_matching_files_info(files: list[Path]):
    """Print a detailed, human-readable summary for each matching cache file."""
    # Label/key pairs for the per-file detail lines, in display order.
    detail_fields = (
        ("Symbol", 'symbol'),
        ("Agg Type", 'agg_type'),
        ("Resolution", 'resolution'),
        ("Excludes", 'excludes_str'),
        ("Minsize", 'minsize'),
        ("Main Session Only", 'main_session_only'),
    )
    for file_path in files:
        file_info = parse_filename(file_path.name)
        if not file_info:
            # Skip names that do not follow the cache naming scheme.
            continue
        print(f"\nFile: {file_path.name}")
        print(f"Coverage: {file_info['start_date']} to {file_info['end_date']}")
        for label, key in detail_fields:
            print(f"{label}: {file_info[key]}")
        print("-" * 80)
def fetch_calendar_data(start: datetime, end: datetime) -> List[Calendar]:
"""
Fetches the trading schedule for the NYSE (New York Stock Exchange) between the specified start and end dates.
@ -109,33 +252,6 @@ def split_range(start: datetime, stop: datetime, period: str = "Y") -> List[Tupl
return ranges
def find_dotenv():
    """
    Walk upward from this module's location looking for a ``.env`` file.

    Falls back to the current working directory when ``__file__`` is
    undefined (e.g. inside a notebook).

    Returns:
        Path to the ``.env`` file if found within 10 levels, otherwise None.
    """
    try:
        origin = __file__
    except NameError:
        # Interactive session / notebook: no module file to anchor on.
        origin = os.getcwd()
    candidate_dir = Path(origin)
    for _ in range(10):  # bounded upward walk
        candidate = candidate_dir / '.env'
        if candidate.exists():
            return candidate
        candidate_dir = candidate_dir.parent
    return None
#create enum AGG_TYPE
class AggType(str, Enum):
"""