agg cache optimized
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
from .vbtutils import AnchoredIndicator, create_mask_from_window, isrising, isfalling, isrisingc, isfallingc, trades2entries_exits, figs2cell
|
||||
from .vbtindicators import register_custom_inds
|
||||
from .utils import find_dotenv, AggType, zoneNY, zonePRG, zoneUTC
|
||||
from .utils import AggType, zoneNY, zonePRG, zoneUTC
|
||||
from .loaders import load_data, prepare_trade_cache
|
||||
@ -1,14 +1,35 @@
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from appdirs import user_data_dir
|
||||
from ttools.utils import find_dotenv
|
||||
import ttools.utils as utils
|
||||
import os
|
||||
import pytz
|
||||
import vectorbtpro as vbt
|
||||
import pytz
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
def find_dotenv():
    """
    Search for a ``.env`` file starting at this module's directory and
    walking up through parent directories.

    Falls back to the current working directory when ``__file__`` is
    undefined (e.g. when running inside a notebook).

    Returns:
        Path to the ``.env`` file if found within 10 directory levels,
        otherwise None.
    """
    try:
        # Resolve to an absolute directory: the original walked up from the
        # file path itself (so the first probe was `<file>/.env`), and a
        # relative __file__ stalls because Path('.').parent == Path('.').
        start_dir = Path(__file__).resolve().parent
    except NameError:
        # __file__ is undefined in interactive sessions / notebooks.
        start_dir = Path(os.getcwd())

    current_path = start_dir
    for _ in range(10):  # Limit search depth to 10 directory levels
        dotenv_path = current_path / '.env'
        if dotenv_path.exists():
            return dotenv_path
        current_path = current_path.parent
    return None
|
||||
|
||||
# Located once at import time; the rest of the package reads this path.
ENV_FILE = find_dotenv()
|
||||
|
||||
|
||||
@ -1,8 +1,5 @@
|
||||
|
||||
from ctypes import Union
|
||||
from dotenv import load_dotenv
|
||||
from appdirs import user_data_dir
|
||||
from ttools.utils import find_dotenv
|
||||
from ttools.config import *
|
||||
from datetime import datetime
|
||||
from alpaca.data.historical import StockHistoricalDataClient
|
||||
@ -16,7 +13,7 @@ from time import time as timetime
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from alpaca.data.enums import DataFeed
|
||||
import random
|
||||
from ttools.utils import AggType, fetch_calendar_data, print, set_verbose
|
||||
from ttools.utils import AggType, fetch_calendar_data, print, print_matching_files_info, set_verbose, list_matching_files
|
||||
from tqdm import tqdm
|
||||
import threading
|
||||
from typing import List, Union
|
||||
@ -393,9 +390,25 @@ def load_data(symbol: Union[str, List[str]],
|
||||
excludes_str = ''.join(map(str, exclude_conditions))
|
||||
file_ohlcv = AGG_CACHE / f"{symbol}-{str(agg_type)}-{str(resolution)}-{start_date.strftime('%Y-%m-%dT%H-%M-%S')}-{end_date.strftime('%Y-%m-%dT%H-%M-%S')}-{str(excludes_str)}-{minsize}-{main_session_only}.parquet"
|
||||
|
||||
if not force_remote and file_ohlcv.exists():
|
||||
ohlcv_df = pd.read_parquet(file_ohlcv, engine='pyarrow')
|
||||
print("Loaded from agg_cache", file_ohlcv)
|
||||
#if matching files with same condition and same or wider date span
|
||||
matched_files = list_matching_files(
|
||||
symbol=symbol,
|
||||
agg_type=str(agg_type),
|
||||
resolution=str(resolution),
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
excludes_str=str(excludes_str),
|
||||
minsize=minsize,
|
||||
main_session_only=main_session_only
|
||||
)
|
||||
print("matched agg files", len(matched_files))
|
||||
print_matching_files_info(matched_files)
|
||||
|
||||
if not force_remote and len(matched_files) > 0:
|
||||
ohlcv_df = pd.read_parquet(matched_files[0],
|
||||
engine='pyarrow',
|
||||
filters=[('time', '>=', start_date), ('time', '<=', end_date)])
|
||||
print("Loaded from agg_cache", matched_files[0])
|
||||
return ohlcv_df
|
||||
else:
|
||||
#neslo by zrychlit, kdyz se zobrazuje pomalu Searching cache - nejaky bottle neck?
|
||||
@ -411,6 +424,11 @@ def load_data(symbol: Union[str, List[str]],
|
||||
ret_dict_df[symbol] = load_data_single(symbol, agg_type, resolution, start_date, end_date, exclude_conditions, minsize, main_session_only, force_remote)
|
||||
|
||||
if return_vbt:
|
||||
try:
|
||||
import vectorbtpro as vbt # Import only when needed
|
||||
except ImportError:
|
||||
raise RuntimeError("vectorbtpro is required for return_vbt. Please install it.")
|
||||
|
||||
return vbt.Data.from_data(vbt.symbol_dict(ret_dict_df), tz_convert=zoneNY)
|
||||
|
||||
return ret_dict_df
|
||||
|
||||
170
ttools/utils.py
170
ttools/utils.py
@ -2,8 +2,10 @@ from pathlib import Path
|
||||
from enum import Enum
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Tuple
|
||||
import re
|
||||
import pytz
|
||||
import calendar
|
||||
from ttools.config import AGG_CACHE
|
||||
import os
|
||||
from alpaca.trading.models import Order, TradeUpdate, Calendar
|
||||
import pandas_market_calendars as mcal
|
||||
@ -26,6 +28,147 @@ def set_verbose(value):
|
||||
global verbose
|
||||
verbose = value
|
||||
|
||||
def parse_filename(filename: str) -> dict:
    """Decompose an AGG_CACHE parquet filename into its components.

    Expected shape:
    ``SYMBOL-AGGTYPE-RESOLUTION-START-END-EXCLUDES-MINSIZE-FLAG.parquet``
    where START/END are ``%Y-%m-%dT%H-%M-%S`` timestamps.

    Returns:
        dict with keys ``symbol``, ``agg_type``, ``resolution``,
        ``start_date``, ``end_date``, ``excludes_str``, ``minsize`` and
        ``main_session_only``, or None when the name does not match the
        expected pattern or a field fails to parse.
    """
    timestamp = r"\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}"
    pattern = (
        r"^([A-Z]+)-"        # symbol
        r"([^-]+)-"          # aggregation type
        r"(\d+)-"            # resolution
        rf"({timestamp})-"   # start date
        rf"({timestamp})-"   # end date
        r"([A-Z0-9]+)-"      # excluded-conditions string
        r"(\d+)-"            # minimum trade size
        r"(True|False)"      # main-session-only flag
        r"\.parquet$"        # file extension
    )
    match = re.match(pattern, filename)
    if match is None:
        return None

    symbol, agg_type, resolution, start_str, end_str, excludes, minsize, flag = match.groups()
    try:
        return {
            'symbol': symbol,
            'agg_type': agg_type,
            'resolution': resolution,
            'start_date': datetime.strptime(start_str, '%Y-%m-%dT%H-%M-%S'),
            'end_date': datetime.strptime(end_str, '%Y-%m-%dT%H-%M-%S'),
            'excludes_str': excludes,
            'minsize': int(minsize),
            'main_session_only': flag == 'True',
        }
    except (ValueError, AttributeError):
        # e.g. a timestamp that matched the regex but is not a real date.
        return None
|
||||
|
||||
def list_matching_files(
    symbol: str = None,
    agg_type: str = None,
    resolution: str = None,
    start_date: datetime = None,
    end_date: datetime = None,
    excludes_str: str = None,
    minsize: int = None,
    main_session_only: bool = None
) -> list[Path]:
    """
    Return the cached aggregate files whose filename components satisfy the
    given criteria, sorted by (start_date, end_date).

    A parameter left as None matches any value for that component. When both
    start_date and end_date are given, a file qualifies only if its own date
    span fully covers the requested interval; a single bound only requires
    overlap on that side.

    Example:
        ```python
        specific_files = list_matching_files(
            symbol="SPY",
            agg_type="AggType.OHLCV",
            resolution="12",
            start_date=datetime(2024, 1, 15, 9, 30),
            end_date=datetime(2024, 1, 15, 16, 0),
            excludes_str="4679BCFMOPUVWZ",
            minsize=100,
            main_session_only=True
        )

        print_matching_files_info(specific_files)
        ```
    """
    # Filenames encode naive timestamps, so drop any tzinfo before comparing.
    # NOTE(review): this strips the zone rather than converting - confirm
    # callers always pass wall-clock times in the cache's own zone.
    if start_date is not None:
        start_date = start_date.replace(tzinfo=None)
    if end_date is not None:
        end_date = end_date.replace(tzinfo=None)

    # Components that must match the parsed filename exactly (None = wildcard).
    exact_criteria = {
        'symbol': symbol,
        'agg_type': agg_type,
        'resolution': resolution,
        'excludes_str': excludes_str,
        'minsize': minsize,
        'main_session_only': main_session_only,
    }

    def covers_request(info: dict) -> bool:
        """True when a parsed filename satisfies every active criterion."""
        if not info:
            # Name did not follow the cache naming scheme.
            return False
        for key, wanted in exact_criteria.items():
            if wanted is not None and info[key] != wanted:
                return False
        if start_date is not None and end_date is not None:
            # Both bounds given: the file's span must cover the whole interval.
            return (info['start_date'] <= start_date and
                    info['end_date'] >= end_date)
        if start_date is not None:
            return info['end_date'] >= start_date
        if end_date is not None:
            return info['start_date'] <= end_date
        return True

    parquet_entries = (
        entry for entry in AGG_CACHE.iterdir()
        if entry.is_file() and entry.name.endswith('.parquet')
    )
    hits = []
    for entry in parquet_entries:
        info = parse_filename(entry.name)
        if covers_request(info):
            hits.append((entry, info))

    # Chronological ordering: earliest start first, ties broken by end date.
    hits.sort(key=lambda pair: (pair[1]['start_date'], pair[1]['end_date']))
    return [entry for entry, _ in hits]
|
||||
|
||||
def print_matching_files_info(files: list[Path]):
    """Print a parsed, human-readable summary for each cache file given."""
    # (label, parsed-dict key) pairs, in display order.
    labeled_fields = (
        ("Symbol", 'symbol'),
        ("Agg Type", 'agg_type'),
        ("Resolution", 'resolution'),
        ("Excludes", 'excludes_str'),
        ("Minsize", 'minsize'),
        ("Main Session Only", 'main_session_only'),
    )
    for file_path in files:
        info = parse_filename(file_path.name)
        if not info:
            # Skip names that do not follow the cache naming scheme.
            continue
        print(f"\nFile: {file_path.name}")
        print(f"Coverage: {info['start_date']} to {info['end_date']}")
        for label, key in labeled_fields:
            print(f"{label}: {info[key]}")
        print("-" * 80)
|
||||
|
||||
def fetch_calendar_data(start: datetime, end: datetime) -> List[Calendar]:
|
||||
"""
|
||||
Fetches the trading schedule for the NYSE (New York Stock Exchange) between the specified start and end dates.
|
||||
@ -109,33 +252,6 @@ def split_range(start: datetime, stop: datetime, period: str = "Y") -> List[Tupl
|
||||
|
||||
return ranges
|
||||
|
||||
|
||||
def find_dotenv():
    """
    Search for a ``.env`` file starting at this module's directory and
    walking up through parent directories.

    Falls back to the current working directory when ``__file__`` is
    undefined (e.g. when running inside a notebook).

    Returns:
        Path to the ``.env`` file if found within 10 directory levels,
        otherwise None.
    """
    try:
        # Resolve to an absolute directory: the original walked up from the
        # file path itself (so the first probe was `<file>/.env`), and a
        # relative __file__ stalls because Path('.').parent == Path('.').
        start_dir = Path(__file__).resolve().parent
    except NameError:
        # __file__ is undefined in interactive sessions / notebooks.
        start_dir = Path(os.getcwd())

    current_path = start_dir
    for _ in range(10):  # Limit search depth to 10 directory levels
        dotenv_path = current_path / '.env'
        if dotenv_path.exists():
            return dotenv_path
        current_path = current_path.parent
    return None
|
||||
|
||||
|
||||
#create enum AGG_TYPE
|
||||
class AggType(str, Enum):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user