agg cache optimized

This commit is contained in:
David Brazda
2024-10-31 13:19:00 +01:00
parent 5770d8324a
commit 47450e2740
9 changed files with 1183 additions and 379 deletions

View File

@ -1,4 +1,4 @@
from .vbtutils import AnchoredIndicator, create_mask_from_window, isrising, isfalling, isrisingc, isfallingc, trades2entries_exits, figs2cell
from .vbtindicators import register_custom_inds
from .utils import find_dotenv, AggType, zoneNY, zonePRG, zoneUTC
from .utils import AggType, zoneNY, zonePRG, zoneUTC
from .loaders import load_data, prepare_trade_cache

View File

@ -1,14 +1,35 @@
from dotenv import load_dotenv
from appdirs import user_data_dir
from ttools.utils import find_dotenv
import ttools.utils as utils
import os
import pytz
import vectorbtpro as vbt
import pytz
from pathlib import Path
from dotenv import load_dotenv
import os
def find_dotenv(start_path=None):
    """
    Search for a ``.env`` file in a directory and its parents.

    Args:
        start_path: Directory to start searching from. When ``None``, the
            directory containing this module is used; in environments where
            ``__file__`` is undefined (e.g. notebooks) the current working
            directory is used instead.

    Returns:
        Path to the first ``.env`` found walking upward (up to 10 levels),
        otherwise ``None``.
    """
    if start_path is None:
        try:
            # Search from this module's directory, not the module file itself.
            start_path = Path(__file__).parent
        except NameError:
            # __file__ is undefined in notebooks / interactive sessions.
            start_path = os.getcwd()
    current_path = Path(start_path)
    for _ in range(10):  # limit search depth to 10 levels
        dotenv_path = current_path / '.env'
        if dotenv_path.exists():
            return dotenv_path
        if current_path.parent == current_path:
            # Reached the filesystem root; nothing more to search.
            return None
        current_path = current_path.parent
    return None


# Resolved once at import time; None when no .env file is found.
ENV_FILE = find_dotenv()

View File

@ -1,8 +1,5 @@
from ctypes import Union
from dotenv import load_dotenv
from appdirs import user_data_dir
from ttools.utils import find_dotenv
from ttools.config import *
from datetime import datetime
from alpaca.data.historical import StockHistoricalDataClient
@ -16,7 +13,7 @@ from time import time as timetime
from concurrent.futures import ThreadPoolExecutor
from alpaca.data.enums import DataFeed
import random
from ttools.utils import AggType, fetch_calendar_data, print, set_verbose
from ttools.utils import AggType, fetch_calendar_data, print, print_matching_files_info, set_verbose, list_matching_files
from tqdm import tqdm
import threading
from typing import List, Union
@ -393,9 +390,25 @@ def load_data(symbol: Union[str, List[str]],
excludes_str = ''.join(map(str, exclude_conditions))
file_ohlcv = AGG_CACHE / f"{symbol}-{str(agg_type)}-{str(resolution)}-{start_date.strftime('%Y-%m-%dT%H-%M-%S')}-{end_date.strftime('%Y-%m-%dT%H-%M-%S')}-{str(excludes_str)}-{minsize}-{main_session_only}.parquet"
if not force_remote and file_ohlcv.exists():
ohlcv_df = pd.read_parquet(file_ohlcv, engine='pyarrow')
print("Loaded from agg_cache", file_ohlcv)
#if matching files with same condition and same or wider date span
matched_files = list_matching_files(
symbol=symbol,
agg_type=str(agg_type),
resolution=str(resolution),
start_date=start_date,
end_date=end_date,
excludes_str=str(excludes_str),
minsize=minsize,
main_session_only=main_session_only
)
print("matched agg files", len(matched_files))
print_matching_files_info(matched_files)
if not force_remote and len(matched_files) > 0:
ohlcv_df = pd.read_parquet(matched_files[0],
engine='pyarrow',
filters=[('time', '>=', start_date), ('time', '<=', end_date)])
print("Loaded from agg_cache", matched_files[0])
return ohlcv_df
else:
#neslo by zrychlit, kdyz se zobrazuje pomalu Searching cache - nejaky bottle neck?
@ -411,6 +424,11 @@ def load_data(symbol: Union[str, List[str]],
ret_dict_df[symbol] = load_data_single(symbol, agg_type, resolution, start_date, end_date, exclude_conditions, minsize, main_session_only, force_remote)
if return_vbt:
try:
import vectorbtpro as vbt # Import only when needed
except ImportError:
raise RuntimeError("vectorbtpro is required for return_vbt. Please install it.")
return vbt.Data.from_data(vbt.symbol_dict(ret_dict_df), tz_convert=zoneNY)
return ret_dict_df

View File

@ -2,8 +2,10 @@ from pathlib import Path
from enum import Enum
from datetime import datetime, timedelta
from typing import List, Tuple
import re
import pytz
import calendar
from ttools.config import AGG_CACHE
import os
from alpaca.trading.models import Order, TradeUpdate, Calendar
import pandas_market_calendars as mcal
@ -26,6 +28,147 @@ def set_verbose(value):
global verbose
verbose = value
def parse_filename(filename: str) -> dict:
    """
    Decompose an AGG_CACHE parquet file name into its components.

    Expected shape:
        SYMBOL-AGGTYPE-RESOLUTION-START-END-EXCLUDES-MINSIZE-FLAG.parquet
    where START/END are ``%Y-%m-%dT%H-%M-%S`` timestamps.

    Returns:
        Dict with keys symbol, agg_type, resolution, start_date, end_date,
        excludes_str, minsize, main_session_only — or ``None`` when the
        name does not match the cache naming scheme.
    """
    stamp = r"\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}"
    pattern = (
        r"^(?P<symbol>[A-Z]+)-"
        r"(?P<agg_type>[^-]+)-"
        r"(?P<resolution>\d+)-"
        rf"(?P<start>{stamp})-"
        rf"(?P<end>{stamp})-"
        r"(?P<excludes>[A-Z0-9]+)-"
        r"(?P<minsize>\d+)-"
        r"(?P<flag>True|False)"
        r"\.parquet$"
    )
    matched = re.match(pattern, filename)
    if matched is None:
        return None
    try:
        return {
            'symbol': matched.group('symbol'),
            'agg_type': matched.group('agg_type'),
            'resolution': matched.group('resolution'),
            'start_date': datetime.strptime(matched.group('start'), '%Y-%m-%dT%H-%M-%S'),
            'end_date': datetime.strptime(matched.group('end'), '%Y-%m-%dT%H-%M-%S'),
            'excludes_str': matched.group('excludes'),
            'minsize': int(matched.group('minsize')),
            'main_session_only': matched.group('flag') == 'True',
        }
    except (ValueError, AttributeError):
        # Malformed timestamp or number despite matching the pattern.
        return None
def list_matching_files(
    symbol: str = None,
    agg_type: str = None,
    resolution: str = None,
    start_date: datetime = None,
    end_date: datetime = None,
    excludes_str: str = None,
    minsize: int = None,
    main_session_only: bool = None
) -> list[Path]:
    """
    Return cached aggregation files matching the given criteria,
    sorted by (start date, end date).

    A ``None`` criterion matches any value. When both ``start_date`` and
    ``end_date`` are provided, a file matches only if its date span fully
    covers the requested interval; with a single bound, any file whose
    span reaches that bound matches.

    Example:
        ```python
        # Example with all parameters specified
        specific_files = list_matching_files(
            symbol="SPY",
            agg_type="AggType.OHLCV",
            resolution="12",
            start_date=datetime(2024, 1, 15, 9, 30),
            end_date=datetime(2024, 1, 15, 16, 0),
            excludes_str="4679BCFMOPUVWZ",
            minsize=100,
            main_session_only=True
        )
        print_matching_files_info(specific_files)
        ```
    """
    # File names carry naive timestamps, so compare with naive datetimes.
    if start_date is not None:
        start_date = start_date.replace(tzinfo=None)
    if end_date is not None:
        end_date = end_date.replace(tzinfo=None)

    # Non-date criteria as (parsed-info key, required value) pairs.
    exact_criteria = (
        ('symbol', symbol),
        ('agg_type', agg_type),
        ('resolution', resolution),
        ('excludes_str', excludes_str),
        ('minsize', minsize),
        ('main_session_only', main_session_only),
    )

    def _is_match(info: dict) -> bool:
        """True when a parsed filename satisfies every active criterion."""
        if not info:
            return False
        for key, wanted in exact_criteria:
            if wanted is not None and info[key] != wanted:
                return False
        if start_date is not None and end_date is not None:
            # Require full coverage of the requested interval.
            return (info['start_date'] <= start_date
                    and info['end_date'] >= end_date)
        if start_date is not None:
            return info['end_date'] >= start_date
        if end_date is not None:
            return info['start_date'] <= end_date
        return True

    candidates = []
    for entry in AGG_CACHE.iterdir():
        if not entry.is_file() or not entry.name.endswith('.parquet'):
            continue
        parsed = parse_filename(entry.name)
        if _is_match(parsed):
            candidates.append((entry, parsed))

    # Order by start date, then end date, and strip the parsed info.
    candidates.sort(key=lambda pair: (pair[1]['start_date'], pair[1]['end_date']))
    return [path for path, _ in candidates]
def print_matching_files_info(files: list[Path]):
    """Print a detailed, human-readable summary for each matching cache file."""
    # Label/key pairs for the per-file detail lines, in display order.
    detail_fields = (
        ("Symbol", 'symbol'),
        ("Agg Type", 'agg_type'),
        ("Resolution", 'resolution'),
        ("Excludes", 'excludes_str'),
        ("Minsize", 'minsize'),
        ("Main Session Only", 'main_session_only'),
    )
    for file_path in files:
        file_info = parse_filename(file_path.name)
        if not file_info:
            # Skip names that do not follow the cache naming scheme.
            continue
        print(f"\nFile: {file_path.name}")
        print(f"Coverage: {file_info['start_date']} to {file_info['end_date']}")
        for label, key in detail_fields:
            print(f"{label}: {file_info[key]}")
        print("-" * 80)
def fetch_calendar_data(start: datetime, end: datetime) -> List[Calendar]:
"""
Fetches the trading schedule for the NYSE (New York Stock Exchange) between the specified start and end dates.
@ -109,33 +252,6 @@ def split_range(start: datetime, stop: datetime, period: str = "Y") -> List[Tupl
return ranges
def find_dotenv():
    """
    Walk upward from this module's location looking for a ``.env`` file.

    Falls back to the current working directory when ``__file__`` is
    undefined (e.g. inside a notebook).

    Returns:
        Path to the ``.env`` file if found within 10 levels, otherwise None.
    """
    try:
        origin = __file__
    except NameError:
        # Interactive session / notebook: no module file to anchor on.
        origin = os.getcwd()
    candidate_dir = Path(origin)
    for _ in range(10):  # bounded upward walk
        candidate = candidate_dir / '.env'
        if candidate.exists():
            return candidate
        candidate_dir = candidate_dir.parent
    return None
#create enum AGG_TYPE
class AggType(str, Enum):
"""