Files
v2realbot/v2realbot/loader/agg_vect.ipynb
2024-05-17 14:09:42 +02:00

52 KiB
Raw Blame History

In [1]:
import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time

from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt

# vectorbtpro plotting defaults: dark theme, layout width of 1280, auto_rangebreaks enabled
vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# pandas display options: HTML repr in the notebook, long frames truncated when shown
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 10)  # Maximum number of rows displayed before the repr is truncated
Activating profile profile1
In [2]:
# Symbol and time window; the naive datetimes are localized to zoneNY right away
symbol = "SPY"
day_start = zoneNY.localize(datetime(2024, 5, 15, 9, 30, 0))
day_stop = zoneNY.localize(datetime(2024, 5, 16, 16, 0, 0))

# TODO: could this be sped up? The "Searching cache" step shows up slowly - is there a bottleneck?
# Optional argument left for reference: exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F']
df = fetch_trades_parallel(symbol, day_start, day_stop, minsize=50)

# Aggregate the fetched trades into time bars with resolution=1
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1, type=BarType.TIME)
ohlcv_df
Calendar data fetch successful 2024-05-15 2024-05-16
Contains 2  market days
Searching cache: SPY-1715779800-1715803200.cache.gz
Searching cache: SPY-1715866200-1715889600.cache.gz
FOUND in CACHE SPY-1715866200-1715889600.cache.gz
FOUND in CACHE SPY-1715779800-1715803200.cache.gz
excluding conditions ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
minsize 50
excluding conditions ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
minsize 50
Out[2]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades
time
2024-05-15 09:30:00-04:00 525.8300 525.96 525.830 525.890 163507.0 145.0
2024-05-15 09:30:01-04:00 525.8900 525.91 525.790 525.810 14254.0 93.0
2024-05-15 09:30:02-04:00 525.8200 525.92 525.800 525.860 2937.0 22.0
2024-05-15 09:30:03-04:00 525.8800 525.89 525.850 525.860 5520.0 34.0
2024-05-15 09:30:04-04:00 525.8450 525.87 525.720 525.740 73191.0 289.0
... ... ... ... ... ... ...
2024-05-16 15:59:55-04:00 528.6300 528.65 528.605 528.640 37439.0 139.0
2024-05-16 15:59:56-04:00 528.6400 528.73 528.640 528.696 21836.0 72.0
2024-05-16 15:59:57-04:00 528.7000 528.74 528.680 528.695 11066.0 60.0
2024-05-16 15:59:58-04:00 528.7100 528.83 528.710 528.830 28015.0 65.0
2024-05-16 15:59:59-04:00 528.8298 528.83 528.560 528.660 25043.0 84.0

38150 rows × 6 columns

In [5]:
# Display the trades DataFrame currently bound to `df`
df
Out[5]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-03-01 09:30:01.069206528-05:00 N 34.520 456915 52983525028686 [ , Q] A
2024-03-01 09:30:01.071717376-05:00 P 34.520 50 52983525359944 [ , I] A
2024-03-01 09:30:01.071723776-05:00 P 34.520 50 52983525359945 [ , I] A
2024-03-01 09:30:01.072288768-05:00 P 34.510 100 52983525359946 [ ] A
2024-03-01 09:30:01.072291840-05:00 P 34.510 300 52983525359947 [ ] A
... ... ... ... ... ... ...
2024-03-01 15:49:58.064368128-05:00 T 34.395 100 62880154539876 [ ] A
2024-03-01 15:49:58.077368064-05:00 T 34.395 100 62880154540085 [ ] A
2024-03-01 15:49:58.088362240-05:00 T 34.395 100 62880154540101 [ ] A
2024-03-01 15:49:58.590776576-05:00 D 34.390 120 71709618548788 [ ] A
2024-03-01 15:49:58.591035136-05:00 D 34.395 120 79372108382794 [ ] A

56737 rows × 6 columns

In [4]:
# Wrap the OHLCV frame in a vbt.Data object keyed by the symbol; tz_convert=zoneNY is passed through to from_data
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
# NOTE(review): auto_rangebreaks is already set to True in the settings cell at the top; this repeat looks redundant - confirm
vbt.settings['plotting']['auto_rangebreaks'] = True
# Plot the OHLCV data (the recorded output contains a candlestick trace and a volume bar trace)
basic_data.ohlcv.plot()
Out[4]:
FigureWidget({
    'data': [{'close': array([525.89 , 525.81 , 525.86 , ..., 528.695, 528.83 , 528.66 ]),
              'decreasing': {'fillcolor': '#ee534f', 'line': {'color': '#ee534f'}},
              'high': array([525.96, 525.91, 525.92, ..., 528.74, 528.83, 528.83]),
              'increasing': {'fillcolor': '#26a69a', 'line': {'color': '#26a69a'}},
              'low': array([525.83, 525.79, 525.8 , ..., 528.68, 528.71, 528.56]),
              'name': 'OHLC',
              'opacity': 0.75,
              'open': array([525.83  , 525.89  , 525.82  , ..., 528.7   , 528.71  , 528.8298]),
              'type': 'candlestick',
              'uid': 'ace5a21b-2317-4646-b45b-de0447bc533c',
              'x': array([datetime.datetime(2024, 5, 15, 9, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 1, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 2, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          ...,
                          datetime.datetime(2024, 5, 16, 15, 59, 57, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 58, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 59, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)],
                         dtype=object),
              'xaxis': 'x',
              'yaxis': 'y'},
             {'marker': {'color': array(['#26a69a', '#ee534f', '#26a69a', ..., '#ee534f', '#26a69a', '#ee534f'],
                                        dtype=object),
                         'line': {'width': 0}},
              'name': 'Volume',
              'opacity': 0.5,
              'type': 'bar',
              'uid': '5015d1bc-4c51-4185-aad0-8829974921aa',
              'x': array([datetime.datetime(2024, 5, 15, 9, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 1, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 15, 9, 30, 2, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          ...,
                          datetime.datetime(2024, 5, 16, 15, 59, 57, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 58, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2024, 5, 16, 15, 59, 59, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)],
                         dtype=object),
              'xaxis': 'x2',
              'y': array([163507.,  14254.,   2937., ...,  11066.,  28015.,  25043.]),
              'yaxis': 'y2'}],
    'layout': {'height': 350,
               'legend': {'orientation': 'h',
                          'traceorder': 'normal',
                          'x': 1,
                          'xanchor': 'right',
                          'y': 1.02,
                          'yanchor': 'bottom'},
               'margin': {'b': 30, 'l': 30, 'r': 30, 't': 30},
               'showlegend': True,
               'template': '...',
               'width': 1280,
               'xaxis': {'anchor': 'y',
                         'domain': [0.0, 1.0],
                         'matches': 'x2',
                         'rangeslider': {'visible': False},
                         'showgrid': True,
                         'showticklabels': False},
               'xaxis2': {'anchor': 'y2', 'domain': [0.0, 1.0], 'showgrid': True},
               'yaxis': {'anchor': 'x', 'domain': [0.3, 1.0], 'showgrid': True},
               'yaxis2': {'anchor': 'x2', 'domain': [0.0, 0.3], 'showgrid': True}}
})
In [ ]:
# Load one cached trades response directly from a gzip-compressed pickle file under DATA_DIR/tradecache
import pickle
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
import gzip

# Hard-coded cache file for BAC; the two numbers are presumably the start/end epoch seconds - TODO confirm
file_path = f"{DATA_DIR}/tradecache/BAC-1709044200-1709067600.cache.gz"

# NOTE(review): pickle.load can execute arbitrary code - only open cache files written by this project
with gzip.open(file_path, 'rb') as fp:
    tradesResponse = pickle.load(fp)

# Display the raw unpickled object
tradesResponse
In [14]:
def convert_dict_to_multiindex_df(tradesResponse):
    """Convert a raw trades response into one DataFrame indexed by (symbol, timestamp).

    Parameters
    ----------
    tradesResponse : dict
        Mapping of symbol -> trade records in any form accepted by
        ``pd.DataFrame`` (e.g. a list of dicts). Every non-empty record set
        must provide the keys 't', 'x', 'p', 's', 'i', 'c' and 'z'.

    Returns
    -------
    pandas.DataFrame
        Columns ``exchange, price, size, id, conditions, tape`` with a
        MultiIndex ``(symbol, timestamp)``; the timestamp level is converted
        to the module-level ``zoneNY`` time zone. When the response holds no
        trades at all, an empty frame with the same columns and index names
        is returned instead of raising.

    Raises
    ------
    KeyError
        If a non-empty record set lacks one of the expected keys.
    """
    # Raw key -> descriptive column name; the dict order also fixes the column order
    column_names = {
        't': 'timestamp',
        'x': 'exchange',
        'p': 'price',
        's': 'size',
        'i': 'id',
        'c': 'conditions',
        'z': 'tape',
    }

    frames = []
    for key, values in tradesResponse.items():
        raw = pd.DataFrame(values)
        if raw.empty:
            # A symbol without trades has none of the expected columns; skip it
            continue
        # Build a fresh frame through a method chain instead of mutating a column
        # subset in place, which can emit SettingWithCopyWarning
        frame = (
            raw[list(column_names)]
            .rename(columns=column_names)
            .assign(
                symbol=key,  # Add ticker as a column
                timestamp=lambda trades: pd.to_datetime(trades['timestamp']),
            )
            .set_index(['symbol', 'timestamp'])
            .tz_convert(zoneNY, level='timestamp')
        )
        frames.append(frame)

    if not frames:
        # pd.concat([]) raises ValueError, so return an explicitly typed empty frame
        empty_index = pd.MultiIndex.from_arrays(
            [pd.Index([], dtype=object), pd.DatetimeIndex([], tz=zoneNY)],
            names=['symbol', 'timestamp'],
        )
        value_columns = [name for name in column_names.values() if name != 'timestamp']
        return pd.DataFrame(columns=value_columns, index=empty_index)

    # Concatenate all per-symbol frames into a single DataFrame with MultiIndex
    return pd.concat(frames)

# Convert the loaded response into a (symbol, timestamp)-indexed DataFrame and display it
df = convert_dict_to_multiindex_df(tradesResponse)
df
Out[14]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-02-27 09:30:00.002164736-05:00 P 33.680 638 52983525158993 [ , F, T] A
2024-02-27 09:30:00.128029184-05:00 P 33.690 7 52983525159224 [ , I] A
2024-02-27 09:30:00.128032256-05:00 P 33.690 7 52983525159225 [ , Q] A
2024-02-27 09:30:00.261718272-05:00 K 33.700 3 52983525302111 [ , F, I] A
2024-02-27 09:30:00.349298176-05:00 D 33.695 1 71675256256563 [ , I] A
... ... ... ... ... ... ...
2024-02-27 15:59:59.996081408-05:00 T 34.270 1 62880189999698 [ , I] A
2024-02-27 15:59:59.996084480-05:00 T 34.270 100 62880189999699 [ ] A
2024-02-27 15:59:59.997648384-05:00 N 34.270 400 52983576998465 [ ] A
2024-02-27 15:59:59.998087168-05:00 T 34.270 1 62880189999929 [ , I] A
2024-02-27 15:59:59.998089984-05:00 T 34.270 100 62880189999930 [ ] A

169811 rows × 6 columns

In [6]:
# Print the index type, column dtypes, non-null counts and memory usage of the trades frame
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 169811 entries, (0, 'BAC') to (169810, 'BAC')
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype              
---  ------  --------------   -----              
 0   c       169811 non-null  object             
 1   i       169811 non-null  int64              
 2   p       169811 non-null  float64            
 3   s       169811 non-null  int64              
 4   t       169811 non-null  datetime64[ns, UTC]
 5   x       169811 non-null  object             
 6   z       169811 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(3)
memory usage: 9.9+ MB
In [4]:
# Print the index range, column dtypes, non-null counts and memory usage of the aggregated bars
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB
In [6]:
# Same summary of the aggregated bars as the previous cell (index range, dtypes, memory usage)
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB
In [3]:
# Re-aggregate the same trades with resolution=1000 and type="dollar"
# NOTE(review): the time-bar cell passes the enum BarType.TIME while this call passes a plain string - confirm aggregate_trades accepts both forms
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1000, type="dollar")
In [5]:
# List the distinct 'YYYY-MM-DD HH' buckets present in the bar index
ohlcv_df.index.strftime('%Y-%m-%d %H').unique()
Out[5]:
Index(['2024-03-01 09', '2024-03-01 10', '2024-03-01 11', '2024-03-01 12',
       '2024-03-01 13', '2024-03-01 14', '2024-03-01 15', '2024-03-04 09',
       '2024-03-04 10', '2024-03-04 11', '2024-03-04 12', '2024-03-04 13',
       '2024-03-04 14', '2024-03-04 15'],
      dtype='object', name='time')
In [5]:
# Alternative check kept for reference: number of bars per calendar date
#ohlcv_df.groupby(ohlcv_df.index.date).size()
# Take the first 100 bars; the rendered table is still shortened by the display.max_rows option
ohlcv_df.head(100)
Out[5]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades
time
2024-03-01 09:34:00.145446062-05:00 34.555 34.555 34.555 34.555 28.939372 1.0
2024-03-01 09:34:00.145447016-05:00 34.555 34.555 34.555 34.555 28.939372 1.0
2024-03-01 09:34:00.145447016-05:00 34.555 34.555 34.555 34.555 28.939372 1.0
2024-03-01 09:34:00.145447016-05:00 34.555 34.555 34.555 34.555 28.939372 1.0
2024-03-01 09:34:00.145447016-05:00 34.555 34.555 34.555 34.555 28.939372 1.0
... ... ... ... ... ... ...
2024-03-01 09:34:05.011623859-05:00 34.560 34.560 34.560 34.560 28.935185 1.0
2024-03-01 09:34:05.011623859-05:00 34.560 34.560 34.560 34.560 28.935185 1.0
2024-03-01 09:34:05.011623859-05:00 34.560 34.560 34.560 34.560 28.935185 1.0
2024-03-01 09:34:05.011623859-05:00 34.560 34.560 34.560 34.560 28.935185 1.0
2024-03-01 09:34:05.011623859-05:00 34.560 34.560 34.560 34.560 28.935185 2.0

100 rows × 6 columns

In [6]:
# Display the trades DataFrame currently bound to `df`
df
Out[6]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-03-01 09:34:00.145446-05:00 D 34.5550 500.0 71675373899865 [ ] A
2024-03-01 09:34:00.864348-05:00 D 34.5563 157.0 71675373958977 [ ] A
2024-03-01 09:34:00.960608-05:00 D 34.5500 100.0 71675373961523 [ ] A
2024-03-01 09:34:01.584619-05:00 D 34.5550 100.0 71675373965623 [ ] A
2024-03-01 09:34:01.793712-05:00 D 34.5550 108.0 71675373966644 [ ] A
... ... ... ... ... ... ...
2024-03-04 15:54:59.940080-05:00 Y 35.1600 63.0 52983525230401 [ , I] A
2024-03-04 15:54:59.940107-05:00 Z 35.1550 65.0 52983526682176 [ , I] A
2024-03-04 15:54:59.940110-05:00 Z 35.1550 200.0 52983526682177 [ ] A
2024-03-04 15:54:59.940113-05:00 Z 35.1600 100.0 52983526682179 [ ] A
2024-03-04 15:54:59.940615-05:00 V 35.1500 96.0 56471089803589 [ , I] A

168972 rows × 6 columns

In [ ]:
# Access just the BAC rows (selects on the first index level of df)
df_filtered = df.loc["BAC"]

# Print dtypes, non-null counts and memory usage of the filtered frame
df_filtered.info()
In [ ]:
# Flatten the index into columns WITHOUT rebinding df_filtered: the previous
# `df_filtered = df_filtered.reset_index()` changed its own input, so every
# re-run of this cell (or of the next one, which repeats the line) stacked
# another reset_index() on the same frame.
ticks = df_filtered.reset_index()[['timestamp', 'price', 'size']].to_numpy()

# Column 0 of the array holds the timestamps
timestamps = ticks[:, 0]

# Display the array (a bare expression only renders when it is the last line of the cell)
ticks
In [ ]:
# Flatten the index into columns WITHOUT rebinding df_filtered, so this cell
# gives the same result no matter how often it (or the previous cell) was run.
ticks = df_filtered.reset_index()[['timestamp', 'price', 'size']].to_numpy()

# Timestamp to number: the timestamps sit in the first column of the array
timestamps = ticks[:, 0]

# Convert each timestamp to Unix seconds (float64) via its .timestamp() method
unix_timestamps_s = np.array([ts.timestamp() for ts in timestamps], dtype='float64')

# Replace the original timestamps in the NumPy array with the converted Unix timestamps
ticks[:, 0] = unix_timestamps_s

# Alternative kept for reference (integer seconds, drops the sub-second part):
#ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000  # Convert to Unix timestamp
ticks
In [ ]:
# Cast the whole array to float64; this relies on column 0 already holding numeric Unix timestamps from the preceding cell
ticks = ticks.astype(np.float64)
ticks
In [ ]:
# Bar resolution handed to generate_time_bars_nb.
# NOTE(review): the previous comment said "60 seconds" but the value is 1 - presumably 1 second; confirm the unit
resolution = 1
# Build the bars from the float64 ticks array (columns: timestamp, price, size)
ohlcv_bars = generate_time_bars_nb(ticks, resolution)
In [ ]:
# Display the raw bar array returned by generate_time_bars_nb
ohlcv_bars
In [ ]:
# Turn the raw bar array into a time-indexed DataFrame
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades']
ohlcv_df = (
    pd.DataFrame(ohlcv_bars, columns=columns)
    # 'time' is read as Unix seconds
    .assign(time=lambda bars: pd.to_datetime(bars['time'], unit='s'))
    .set_index('time')
)
# The parsed index is naive: mark it as UTC first, then convert to zoneNY
ohlcv_df.index = ohlcv_df.index.tz_localize('UTC').tz_convert(zoneNY)

# Optional narrowing / sanity checks kept for reference:
#ohlcv_df = ohlcv_df.loc["2024-03-1 15:50:00":"2024-03-28 13:40:00"]
#ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

ohlcv_df
In [ ]: