v2realbot/agg_vect.ipynb at f04c3e9f12379c6d8b44b2b63a5b67211ed8cc8a

Files

David Brazda 63c2f7e748 vectorized aggregator, minor changes (#198 )

2024-05-17 14:09:42 +02:00

52 KiB

Raw Blame History

In [1]:

import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time

from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt

# vectorbtpro plotting defaults: dark theme, 1280 px wide figures, auto_rangebreaks enabled.
vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# pandas display options: keep the HTML repr in the notebook and truncate long frames
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 10)  # Frames with more rows than this are shown truncated

Activating profile profile1

In [2]:

symbol = "SPY"
# Range boundaries are written as naive wall-clock datetimes and localized to zoneNY.
day_start = zoneNY.localize(datetime(2024, 5, 15, 9, 30, 0))
day_stop = zoneNY.localize(datetime(2024, 5, 16, 16, 0, 0))
# TODO: could this be sped up? "Searching cache" is slow to appear - is there a bottleneck?
# Optional extra argument: exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F']
df = fetch_trades_parallel(symbol, day_start, day_stop, minsize=50)
# Aggregate the fetched trades into time bars with resolution=1.
ohlcv_df = aggregate_trades(
    symbol=symbol,
    trades_df=df,
    resolution=1,
    type=BarType.TIME,
)
ohlcv_df

Calendar data fetch successful 2024-05-15 2024-05-16

Contains 2  market days
Searching cache: SPY-1715779800-1715803200.cache.gz
Searching cache: SPY-1715866200-1715889600.cache.gz
FOUND in CACHE SPY-1715866200-1715889600.cache.gz
FOUND in CACHE SPY-1715779800-1715803200.cache.gz
excluding conditions ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
minsize 50
excluding conditions ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
minsize 50

Out[2]:

	open	high	low	close	volume	trades
time
2024-05-15 09:30:00-04:00	525.8300	525.96	525.830	525.890	163507.0	145.0
2024-05-15 09:30:01-04:00	525.8900	525.91	525.790	525.810	14254.0	93.0
2024-05-15 09:30:02-04:00	525.8200	525.92	525.800	525.860	2937.0	22.0
2024-05-15 09:30:03-04:00	525.8800	525.89	525.850	525.860	5520.0	34.0
2024-05-15 09:30:04-04:00	525.8450	525.87	525.720	525.740	73191.0	289.0
...	...	...	...	...	...	...
2024-05-16 15:59:55-04:00	528.6300	528.65	528.605	528.640	37439.0	139.0
2024-05-16 15:59:56-04:00	528.6400	528.73	528.640	528.696	21836.0	72.0
2024-05-16 15:59:57-04:00	528.7000	528.74	528.680	528.695	11066.0	60.0
2024-05-16 15:59:58-04:00	528.7100	528.83	528.710	528.830	28015.0	65.0
2024-05-16 15:59:59-04:00	528.8298	528.83	528.560	528.660	25043.0	84.0

38150 rows × 6 columns

In [5]:

df

Out[5]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-03-01 09:30:01.069206528-05:00	N	34.520	456915	52983525028686	[ , Q]	A
	2024-03-01 09:30:01.071717376-05:00	P	34.520	50	52983525359944	[ , I]	A
	2024-03-01 09:30:01.071723776-05:00	P	34.520	50	52983525359945	[ , I]	A
	2024-03-01 09:30:01.072288768-05:00	P	34.510	100	52983525359946	[ ]	A
	2024-03-01 09:30:01.072291840-05:00	P	34.510	300	52983525359947	[ ]	A
	...	...	...	...	...	...	...
	2024-03-01 15:49:58.064368128-05:00	T	34.395	100	62880154539876	[ ]	A
	2024-03-01 15:49:58.077368064-05:00	T	34.395	100	62880154540085	[ ]	A
	2024-03-01 15:49:58.088362240-05:00	T	34.395	100	62880154540101	[ ]	A
	2024-03-01 15:49:58.590776576-05:00	D	34.390	120	71709618548788	[ ]	A
	2024-03-01 15:49:58.591035136-05:00	D	34.395	120	79372108382794	[ ]	A

56737 rows × 6 columns

In [4]:

# Wrap the aggregated bars in a vectorbtpro Data object keyed by `symbol`, then plot its OHLCV view.
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
# NOTE(review): auto_rangebreaks is already set to True in the first cell; this line repeats that setting.
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()

Out[4]:

FigureWidget({
    'data': [{'close': array([525.89 , 525.81 , 525.86 , ..., 528.695, 528.83 , 528.66 ]),
              'decreasing': {'fillcolor': '#ee534f', 'line': {'color': '#ee534f'}},
              'high': array([525.96, 525.91, 525.92, ..., 528.74, 528.83, 528.83]),
              'increasing': {'fillcolor': '#26a69a', 'line': {'color': '#26a69a'}},
              'low': array([525.83, 525.79, 525.8 , ..., 528.68, 528.71, 528.56]),
              'name': 'OHLC',
              'opacity': 0.75,
              'open': array([525.83  , 525.89  , 525.82  , ..., 528.7   , 528.71  , 528.8298]),
              'type': 'candlestick',
              'uid': 'ace5a21b-2317-4646-b45b-de0447bc533c',
              'x': array([datetime.datetime(2024, 5, 15, 9, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 1, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 2, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          ...,
                          datetime.datetime(2024, 5, 16, 15, 59, 57, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 58, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 59, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)],
                         dtype=object),
              'xaxis': 'x',
              'yaxis': 'y'},
             {'marker': {'color': array(['#26a69a', '#ee534f', '#26a69a', ..., '#ee534f', '#26a69a', '#ee534f'],
                                        dtype=object),
                         'line': {'width': 0}},
              'name': 'Volume',
              'opacity': 0.5,
              'type': 'bar',
              'uid': '5015d1bc-4c51-4185-aad0-8829974921aa',
              'x': array([datetime.datetime(2024, 5, 15, 9, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 1, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 2, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          ...,
                          datetime.datetime(2024, 5, 16, 15, 59, 57, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 58, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 59, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)],
                         dtype=object),
              'xaxis': 'x2',
              'y': array([163507.,  14254.,   2937., ...,  11066.,  28015.,  25043.]),
              'yaxis': 'y2'}],
    'layout': {'height': 350,
               'legend': {'orientation': 'h',
                          'traceorder': 'normal',
                          'x': 1,
                          'xanchor': 'right',
                          'y': 1.02,
                          'yanchor': 'bottom'},
               'margin': {'b': 30, 'l': 30, 'r': 30, 't': 30},
               'showlegend': True,
               'template': '...',
               'width': 1280,
               'xaxis': {'anchor': 'y',
                         'domain': [0.0, 1.0],
                         'matches': 'x2',
                         'rangeslider': {'visible': False},
                         'showgrid': True,
                         'showticklabels': False},
               'xaxis2': {'anchor': 'y2', 'domain': [0.0, 1.0], 'showgrid': True},
               'yaxis': {'anchor': 'x', 'domain': [0.3, 1.0], 'showgrid': True},
               'yaxis2': {'anchor': 'x2', 'domain': [0.0, 0.3], 'showgrid': True}}
})

In [ ]:

import gzip
import pickle
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR

# Trade cache entry for BAC, stored as a gzip-compressed pickle under DATA_DIR/tradecache.
file_path = f"{DATA_DIR}/tradecache/BAC-1709044200-1709067600.cache.gz"

# NOTE: unpickling can execute arbitrary code - only load cache files written by this project.
with gzip.open(file_path, 'rb') as cache_file:
    tradesResponse = pickle.load(cache_file)

tradesResponse

In [14]:

def convert_dict_to_multiindex_df(tradesResponse):
    """Convert a raw trades response into one DataFrame indexed by (symbol, timestamp).

    Parameters
    ----------
    tradesResponse : mapping
        Maps each symbol to a collection of trade records. Every record must
        provide the keys 't', 'x', 'p', 's', 'i', 'c' and 'z'.

    Returns
    -------
    pandas.DataFrame
        Columns ``exchange, price, size, id, conditions, tape`` (in that order),
        indexed by a ``('symbol', 'timestamp')`` MultiIndex whose timestamp level
        is converted to ``zoneNY``. When the response holds no trades at all, an
        empty frame with the same columns and index names is returned instead of
        letting ``pd.concat`` raise on an empty list.

    Raises
    ------
    KeyError
        If a non-empty set of records lacks one of the expected keys.
    """
    # Raw single-letter keys -> descriptive names; dict order fixes the column order.
    column_map = {
        't': 'timestamp',
        'x': 'exchange',
        'p': 'price',
        's': 'size',
        'i': 'id',
        'c': 'conditions',
        'z': 'tape',
    }
    output_columns = [name for name in column_map.values() if name != 'timestamp']

    frames = []
    for key, values in tradesResponse.items():
        raw = pd.DataFrame(values)
        if raw.empty:
            # A symbol without trades has no columns to select from; skip it.
            continue
        # Chained, copy-returning calls avoid mutating a column-subset frame in place
        # (the pattern that triggers pandas' SettingWithCopyWarning).
        frame = (
            raw[list(column_map)]
            .rename(columns=column_map)
            .assign(
                symbol=key,
                # utc=True keeps UTC-aware input unchanged and makes naive input
                # tz-aware, so the tz_convert below cannot fail on naive timestamps.
                timestamp=lambda trades: pd.to_datetime(trades['timestamp'], utc=True),
            )
            .set_index(['symbol', 'timestamp'])
            .tz_convert(zoneNY, level='timestamp')
        )
        frames.append(frame)

    if not frames:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['symbol', 'timestamp'])
        return pd.DataFrame(columns=output_columns, index=empty_index)

    # Stack the per-symbol frames into a single MultiIndex frame.
    return pd.concat(frames)

# Build the (symbol, timestamp)-indexed trades frame from the loaded response and display it
df = convert_dict_to_multiindex_df(tradesResponse)
df

Out[14]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-02-27 09:30:00.002164736-05:00	P	33.680	638	52983525158993	[ , F, T]	A
	2024-02-27 09:30:00.128029184-05:00	P	33.690	7	52983525159224	[ , I]	A
	2024-02-27 09:30:00.128032256-05:00	P	33.690	7	52983525159225	[ , Q]	A
	2024-02-27 09:30:00.261718272-05:00	K	33.700	3	52983525302111	[ , F, I]	A
	2024-02-27 09:30:00.349298176-05:00	D	33.695	1	71675256256563	[ , I]	A
	...	...	...	...	...	...	...
	2024-02-27 15:59:59.996081408-05:00	T	34.270	1	62880189999698	[ , I]	A
	2024-02-27 15:59:59.996084480-05:00	T	34.270	100	62880189999699	[ ]	A
	2024-02-27 15:59:59.997648384-05:00	N	34.270	400	52983576998465	[ ]	A
	2024-02-27 15:59:59.998087168-05:00	T	34.270	1	62880189999929	[ , I]	A
	2024-02-27 15:59:59.998089984-05:00	T	34.270	100	62880189999930	[ ]	A

169811 rows × 6 columns

In [6]:

df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 169811 entries, (0, 'BAC') to (169810, 'BAC')
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype              
---  ------  --------------   -----              
 0   c       169811 non-null  object             
 1   i       169811 non-null  int64              
 2   p       169811 non-null  float64            
 3   s       169811 non-null  int64              
 4   t       169811 non-null  datetime64[ns, UTC]
 5   x       169811 non-null  object             
 6   z       169811 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(3)
memory usage: 9.9+ MB

In [4]:

ohlcv_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB

In [6]:

ohlcv_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB

In [3]:

# Re-aggregate the same trades with type="dollar" and resolution=1000 (overwrites ohlcv_df).
# NOTE(review): the time-bar cell passes the enum BarType.TIME while this call passes a plain
# string - consider the matching BarType member for consistency; confirm it exists.
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1000, type="dollar")

In [5]:

ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

Out[5]:

Index(['2024-03-01 09', '2024-03-01 10', '2024-03-01 11', '2024-03-01 12',
       '2024-03-01 13', '2024-03-01 14', '2024-03-01 15', '2024-03-04 09',
       '2024-03-04 10', '2024-03-04 11', '2024-03-04 12', '2024-03-04 13',
       '2024-03-04 14', '2024-03-04 15'],
      dtype='object', name='time')

In [5]:

#ohlcv_df.groupby(ohlcv_df.index.date).size()
ohlcv_df.head(100)

Out[5]:

	open	high	low	close	volume	trades
time
2024-03-01 09:34:00.145446062-05:00	34.555	34.555	34.555	34.555	28.939372	1.0
2024-03-01 09:34:00.145447016-05:00	34.555	34.555	34.555	34.555	28.939372	1.0
2024-03-01 09:34:00.145447016-05:00	34.555	34.555	34.555	34.555	28.939372	1.0
2024-03-01 09:34:00.145447016-05:00	34.555	34.555	34.555	34.555	28.939372	1.0
2024-03-01 09:34:00.145447016-05:00	34.555	34.555	34.555	34.555	28.939372	1.0
...	...	...	...	...	...	...
2024-03-01 09:34:05.011623859-05:00	34.560	34.560	34.560	34.560	28.935185	1.0
2024-03-01 09:34:05.011623859-05:00	34.560	34.560	34.560	34.560	28.935185	1.0
2024-03-01 09:34:05.011623859-05:00	34.560	34.560	34.560	34.560	28.935185	1.0
2024-03-01 09:34:05.011623859-05:00	34.560	34.560	34.560	34.560	28.935185	1.0
2024-03-01 09:34:05.011623859-05:00	34.560	34.560	34.560	34.560	28.935185	2.0

100 rows × 6 columns

In [6]:

df

Out[6]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-03-01 09:34:00.145446-05:00	D	34.5550	500.0	71675373899865	[ ]	A
	2024-03-01 09:34:00.864348-05:00	D	34.5563	157.0	71675373958977	[ ]	A
	2024-03-01 09:34:00.960608-05:00	D	34.5500	100.0	71675373961523	[ ]	A
	2024-03-01 09:34:01.584619-05:00	D	34.5550	100.0	71675373965623	[ ]	A
	2024-03-01 09:34:01.793712-05:00	D	34.5550	108.0	71675373966644	[ ]	A
	...	...	...	...	...	...	...
	2024-03-04 15:54:59.940080-05:00	Y	35.1600	63.0	52983525230401	[ , I]	A
	2024-03-04 15:54:59.940107-05:00	Z	35.1550	65.0	52983526682176	[ , I]	A
	2024-03-04 15:54:59.940110-05:00	Z	35.1550	200.0	52983526682177	[ ]	A
	2024-03-04 15:54:59.940113-05:00	Z	35.1600	100.0	52983526682179	[ ]	A
	2024-03-04 15:54:59.940615-05:00	V	35.1500	96.0	56471089803589	[ , I]	A

168972 rows × 6 columns

In [ ]:

# Select only the BAC rows from the (symbol, timestamp) MultiIndex frame
# NOTE(review): "BAC" is hard-coded here while `symbol` is set elsewhere in the notebook - confirm which is intended.
df_filtered = df.loc["BAC"]

df_filtered.info()

In [ ]:

# Expose `timestamp` as a regular column WITHOUT rebinding df_filtered: repeatedly running
# `df_filtered = df_filtered.reset_index()` keeps inserting helper columns ('index',
# 'level_0') and eventually raises, so the cell was not safe to re-run.
ticks_source = df_filtered if 'timestamp' in df_filtered.columns else df_filtered.reset_index()
# One row per trade: [timestamp, price, size]
ticks = ticks_source[['timestamp', 'price', 'size']].to_numpy()
# First column holds the trade timestamps
timestamps = ticks[:, 0]

In [ ]:

# Expose `timestamp` as a regular column WITHOUT rebinding df_filtered: repeatedly running
# `df_filtered = df_filtered.reset_index()` keeps inserting helper columns ('index',
# 'level_0') and eventually raises, so the cell was not safe to re-run.
ticks_source = df_filtered if 'timestamp' in df_filtered.columns else df_filtered.reset_index()
# One row per trade: [timestamp, price, size]
ticks = ticks_source[['timestamp', 'price', 'size']].to_numpy()

# Extract the timestamps column (the first column)
timestamps = ticks[:, 0]

# Convert each timestamp to Unix epoch seconds as float64.
# The per-element .timestamp() call is kept deliberately so the values stay exactly
# what this cell produced before.
unix_timestamps_s = np.array([ts.timestamp() for ts in timestamps], dtype='float64')

# Replace the original timestamps in the NumPy array with the converted Unix timestamps
ticks[:, 0] = unix_timestamps_s

ticks

In [ ]:

# Cast the whole tick array to float64 and display it
# NOTE(review): presumably required by the numba-compiled generate_time_bars_nb - confirm.
ticks = ticks.astype(np.float64)
ticks

In [ ]:

resolution = 1  # NOTE(review): value is 1, not 60; unit is presumably seconds - confirm against generate_time_bars_nb
ohlcv_bars = generate_time_bars_nb(ticks, resolution)

In [ ]:

ohlcv_bars

In [ ]:

# Convert the resulting array back to a DataFrame
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades']
ohlcv_df = pd.DataFrame(ohlcv_bars, columns=columns)
ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s')
ohlcv_df.set_index('time', inplace=True)
ohlcv_df.index = ohlcv_df.index.tz_localize('UTC').tz_convert(zoneNY)
#ohlcv_df = ohlcv_df.loc["2024-03-1 15:50:00":"2024-03-28 13:40:00"]
#ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

ohlcv_df

In [ ]:

52 KiB Raw Blame History Unescape Escape

52 KiB

Raw Blame History