Files
ttools/tests/WIP-tradecache_duckdb_approach/hive_cache.ipynb
2024-10-31 13:19:00 +01:00

6.4 KiB

Exploring alternative cache storage using duckdb and parquet

https://claude.ai/chat/e49491f7-8b18-4fb7-b301-5c9997746079

In [1]:
from ttools.tradecache import TradeCache
from ttools.utils import zoneNY
from pathlib import Path
from datetime import datetime
import logging
import duckdb

logging.basicConfig(
    level=logging.INFO,  # Set the minimum level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(levelname)s: %(message)s'  # Simple format showing level and message
)

cache = TradeCache(
    base_path=Path("./trade_cache"),
    max_workers=4,  # Adjust based on your CPU
    cleanup_after_days=7
)

# Load data
df = cache.load_range(
    symbol="BAC",
    start_date=zoneNY.localize(datetime(2024, 10, 14, 9, 30)),
    end_date=zoneNY.localize(datetime(2024, 10, 20, 16, 0)),
    #columns=['open', 'high', 'low', 'close', 'volume']
)

print(f"Loaded {len(df)} rows")
TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env
Start loading data... 1730370862.4833238
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
fetched parquet -11.310973167419434
Loaded 1836460 rows
In [4]:
import duckdb

def check_parquet_schema():
    # Read one file and print its structure
    sample_file = Path("./trade_cache")/"temp/BAC_20241014.parquet"
    
    # Method 1: Using DuckDB describe
    print("DuckDB Schema:")
    print(duckdb.sql(f"DESCRIBE SELECT * FROM read_parquet('{sample_file}')").df())
    
    # Method 2: Just look at the data
    print("\nSample Data:")
    print(duckdb.sql(f"""
        SELECT *
        FROM read_parquet('{sample_file}')
        LIMIT 5
    """).df())
    
    # Method 3: Using pandas
    print("\nPandas Info:")
    df = pd.read_parquet(sample_file)
    print(df.info())

# Let's check the schema first
check_parquet_schema()
DuckDB Schema:
  column_name               column_type null   key default extra
0           x                   VARCHAR  YES  None    None  None
1           p                    DOUBLE  YES  None    None  None
2           s                    BIGINT  YES  None    None  None
3           i                    BIGINT  YES  None    None  None
4           c                 VARCHAR[]  YES  None    None  None
5           z                   VARCHAR  YES  None    None  None
6           t  TIMESTAMP WITH TIME ZONE  YES  None    None  None

Sample Data:
   x       p       s               i             c  z  \
0  T  41.870      27  62879146994030  [ , F, T, I]  A   
1  D  41.965       1  71675241580848        [ , I]  A   
2  D  41.965       1  71675241644625        [ , I]  A   
3  D  41.850       1  71675241772360        [ , I]  A   
4  N  41.960  416188  52983525028174        [ , O]  A   

                                 t  
0 2024-10-14 15:30:00.006480+02:00  
1 2024-10-14 15:30:00.395802+02:00  
2 2024-10-14 15:30:00.484008+02:00  
3 2024-10-14 15:30:00.610005+02:00  
4 2024-10-14 15:30:01.041599+02:00  

Pandas Info:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 25
     22     print(df.info())
     24 # Let's check the schema first
---> 25 check_parquet_schema()

Cell In[4], line 21, in check_parquet_schema()
     19 # Method 3: Using pandas
     20 print("\nPandas Info:")
---> 21 df = pd.read_parquet(sample_file)
     22 print(df.info())

NameError: name 'pd' is not defined