6.4 KiB
6.4 KiB
Exploring alternative cache storage using duckdb and parquet
In [1]:
from ttools.tradecache import TradeCache from ttools.utils import zoneNY from pathlib import Path from datetime import datetime import logging import duckdb logging.basicConfig( level=logging.INFO, # Set the minimum level (DEBUG, INFO, WARNING, ERROR, CRITICAL) format='%(levelname)s: %(message)s' # Simple format showing level and message ) cache = TradeCache( base_path=Path("./trade_cache"), max_workers=4, # Adjust based on your CPU cleanup_after_days=7 ) # Load data df = cache.load_range( symbol="BAC", start_date=zoneNY.localize(datetime(2024, 10, 14, 9, 30)), end_date=zoneNY.localize(datetime(2024, 10, 20, 16, 0)), #columns=['open', 'high', 'low', 'close', 'volume'] ) print(f"Loaded {len(df)} rows")
TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env Start loading data... 1730370862.4833238
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
fetched parquet -11.310973167419434 Loaded 1836460 rows
In [4]:
import duckdb def check_parquet_schema(): # Read one file and print its structure sample_file = Path("./trade_cache")/"temp/BAC_20241014.parquet" # Method 1: Using DuckDB describe print("DuckDB Schema:") print(duckdb.sql(f"DESCRIBE SELECT * FROM read_parquet('{sample_file}')").df()) # Method 2: Just look at the data print("\nSample Data:") print(duckdb.sql(f""" SELECT * FROM read_parquet('{sample_file}') LIMIT 5 """).df()) # Method 3: Using pandas print("\nPandas Info:") df = pd.read_parquet(sample_file) print(df.info()) # Let's check the schema first check_parquet_schema()
DuckDB Schema:
column_name column_type null key default extra
0 x VARCHAR YES None None None
1 p DOUBLE YES None None None
2 s BIGINT YES None None None
3 i BIGINT YES None None None
4 c VARCHAR[] YES None None None
5 z VARCHAR YES None None None
6 t TIMESTAMP WITH TIME ZONE YES None None None
Sample Data:
x p s i c z \
0 T 41.870 27 62879146994030 [ , F, T, I] A
1 D 41.965 1 71675241580848 [ , I] A
2 D 41.965 1 71675241644625 [ , I] A
3 D 41.850 1 71675241772360 [ , I] A
4 N 41.960 416188 52983525028174 [ , O] A
t
0 2024-10-14 15:30:00.006480+02:00
1 2024-10-14 15:30:00.395802+02:00
2 2024-10-14 15:30:00.484008+02:00
3 2024-10-14 15:30:00.610005+02:00
4 2024-10-14 15:30:01.041599+02:00
Pandas Info:
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 25 22 print(df.info()) 24 # Let's check the schema first ---> 25 check_parquet_schema() Cell In[4], line 21, in check_parquet_schema() 19 # Method 3: Using pandas 20 print("\nPandas Info:") ---> 21 df = pd.read_parquet(sample_file) 22 print(df.info()) NameError: name 'pd' is not defined