Files
v2realbot/research/loading_trades_aggregation.ipynb
2024-05-17 14:09:42 +02:00

8.5 KiB

Loading trades and vectorized aggregation

In [ ]:
# --- Imports: stdlib, third-party (pandas/numpy/numba/alpaca/vbt), project-local (v2realbot) ---
import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time

from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt

# --- Plotting / display configuration ---
vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
# Skip non-trading gaps (nights/weekends) on plotted time axes
vbt.settings.plotting.auto_rangebreaks = True
# Set the option to display with pagination
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 10)  # Number of rows per page
In [ ]:
symbol = "SPY"
#datetime in zoneNY 
# Session window: 2024-05-15 09:30 through 2024-05-16 16:00, localized to New York time
day_start = datetime(2024, 5, 15, 9, 30, 0)
day_stop = datetime(2024, 5, 16, 16, 00, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)
# TODO couldn't this be sped up? "Searching cache" displays slowly - some bottleneck?
df = fetch_trades_parallel(symbol, day_start, day_stop, minsize=50) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
# Aggregate raw trades into time bars; resolution=1 — presumably seconds, TODO confirm
# against aggregate_trades (a later cell's comment mentions 60 s for the same value).
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1, type=BarType.TIME)
#df.info()
ohlcv_df
In [ ]:
# Display the raw trades DataFrame fetched above
df
In [ ]:
# Wrap the OHLCV frame in a vectorbtpro Data object (converted to NY time) and plot it
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
# NOTE(review): auto_rangebreaks was already enabled in the config cell — this repeat is redundant
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()
In [ ]:
import pickle
import gzip

# Inspect a raw trade-cache file (gzip-compressed pickle) written by the loader.
# DATA_DIR is already imported in the first cell; the original duplicate import of
# the paper-account credentials was redundant (they are unused in this cell).
# Filename encodes symbol and an epoch-second time range.
file_path = f"{DATA_DIR}/tradecache/BAC-1709044200-1709067600.cache.gz"

# Security note: pickle.load can execute arbitrary code — only open trusted,
# locally produced cache files.
with gzip.open(file_path, 'rb') as fp:
    tradesResponse = pickle.load(fp)

tradesResponse
In [ ]:
def convert_dict_to_multiindex_df(tradesResponse):
    """Flatten a {symbol: [trade records]} response into one MultiIndex DataFrame.

    Each record's short Alpaca field names are renamed to descriptive columns
    (timestamp/exchange/price/size/id/conditions/tape), and the result is
    indexed by (symbol, timestamp) with timestamps converted to New York time.
    """
    column_names = {'t': 'timestamp', 'c': 'conditions', 'p': 'price',
                    's': 'size', 'x': 'exchange', 'z': 'tape', 'i': 'id'}
    per_symbol_frames = []
    for ticker, trades in tradesResponse.items():
        frame = pd.DataFrame(trades)[['t', 'x', 'p', 's', 'i', 'c', 'z']].rename(columns=column_names)
        frame['symbol'] = ticker
        # Parse raw timestamp strings before they become part of the index
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        frame = frame.set_index(['symbol', 'timestamp'])
        per_symbol_frames.append(frame.tz_convert(zoneNY, level='timestamp'))

    # Stack all per-symbol frames into a single MultiIndex DataFrame
    return pd.concat(per_symbol_frames)

# Convert and print the DataFrame
# NOTE(review): this rebinds `df`, clobbering the SPY trades frame loaded earlier —
# cells above that use `df` will behave differently if re-run after this point.
df = convert_dict_to_multiindex_df(tradesResponse)
df
In [ ]:
# Column dtypes / memory footprint of the multi-indexed trades frame
df.info()
In [ ]:
# Structure of the aggregated bar frame
ohlcv_df.info()
In [ ]:
# NOTE(review): exact duplicate of the previous cell — consider deleting
ohlcv_df.info()
In [ ]:
# Re-aggregate the cached trades into dollar bars (resolution=1000 per bar).
# NOTE(review): `type` is passed as the string "dollar" here but as BarType.TIME
# above — confirm aggregate_trades accepts both; prefer the BarType enum for consistency.
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1000, type="dollar")
In [ ]:
# Distinct date-hour buckets present in the bar index (sanity check of coverage)
ohlcv_df.index.strftime('%Y-%m-%d %H').unique()
In [ ]:
#ohlcv_df.groupby(ohlcv_df.index.date).size()
# Preview the first 100 dollar bars
ohlcv_df.head(100)
In [ ]:
# Display the multi-indexed trades frame again
df
In [ ]:
#access just BAC (cross-section of the 'symbol' level of the MultiIndex)
df_filtered = df.loc["BAC"]

df_filtered.info()
In [ ]:
# NOTE(review): this cell is an earlier draft of the next one (same reset_index +
# column extraction); running both resets the index twice. Also, the bare `ticks`
# mid-cell displays nothing (only the last expression renders). Consider deleting.
df_filtered= df_filtered.reset_index()
ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()
ticks
timestamps = ticks[:, 0]
In [ ]:
# Build a (timestamp, price, size) ndarray and convert timestamps to Unix seconds.
df_filtered= df_filtered.reset_index()
ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()

#timestamp to integer
# Extract the timestamps column (assuming it's the first column)
timestamps = ticks[:, 0]

# Convert the timestamps to Unix timestamps in seconds with microsecond precision
unix_timestamps_s = np.array([ts.timestamp() for ts in timestamps], dtype='float64')

# Replace the original timestamps in the NumPy array with the converted Unix timestamps
# (ticks has object dtype due to the mixed columns, so the floats are stored boxed;
# the next cell casts the whole array to float64)
ticks[:, 0] = unix_timestamps_s

#ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000  # Convert to Unix timestamp
ticks
In [ ]:
# Cast the object array to a homogeneous float64 array so the numba-compiled
# generate_time_bars_nb can consume it
ticks = ticks.astype(np.float64)
ticks
In [ ]:
resolution = 1  # bar resolution passed to generate_time_bars_nb — presumably seconds; TODO confirm (original comment said "60 seconds" but the value is 1)
ohlcv_bars = generate_time_bars_nb(ticks, resolution)
In [ ]:
# Raw numba output: one row per bar, columns time/open/high/low/close/volume/trades
ohlcv_bars
In [ ]:
# Wrap the numba bar array into a tz-aware OHLCV DataFrame indexed by bar time.
bar_columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades']
ohlcv_df = pd.DataFrame(ohlcv_bars, columns=bar_columns)
# Bar times come back as Unix seconds; parse them and promote to the index
ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s')
ohlcv_df = ohlcv_df.set_index('time')
# Unix epoch is UTC by definition — localize first, then convert to New York time
ohlcv_df.index = ohlcv_df.index.tz_localize('UTC').tz_convert(zoneNY)
#ohlcv_df = ohlcv_df.loc["2024-03-1 15:50:00":"2024-03-28 13:40:00"]
#ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

ohlcv_df
In [ ]: