Files
v2realbot/research/prepare_aggregatied_data.ipynb

57 KiB
Raw Blame History

Loading trades and vectorized aggregation

Describes how to fetch trades (remote or cached) and use the new vectorized aggregation to build bars of a given type (time, volume, dollar) and resolution.

fetch_trades_parallel fetches trades for a given symbol and interval; it can also filter by trade conditions and minimum size, and returns trades_df. aggregate_trades accepts trades_df, a resolution, and a bar type (VOLUME, TIME, DOLLAR), and returns the aggregated OHLCV dataframe ohlcv_df.

In [20]:
# --- Imports: stdlib, third-party (alpaca, vectorbtpro, numba), project (v2realbot) ---
import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time
from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt
import v2realbot.utils.config_handler as cfh

# --- Plotting and pandas display configuration ---
vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# Set the option to display with pagination
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 20)  # Number of rows per page
# pd.set_option('display.float_format', '{:.9f}'.format)


#trade filtering
# Trade-condition codes excluded from aggregation, read from project config.
exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F']
minsize = 100  # minimum trade size to keep

symbol = "BAC"
#datetime in zoneNY 
day_start = datetime(2023, 1, 1, 9, 30, 0)
day_stop = datetime(2024, 5, 25, 15, 30, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)
#filename of trades_df parquet, date are in isoformat but without time zone part
# NOTE(review): `dir` shadows the Python builtin; later cells reuse this name,
# so it is deliberately kept as-is here.
dir = DATA_DIR + "/notebooks/"
#parquet interval cache contains exclude conditions and minsize filtering
file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
#file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet"
file_ohlcv = dir + f"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
print(file_trades)
print(file_ohlcv)
#PRINT all parquet in directory
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
for f in files:
    print(f)
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
In [18]:
#Either load trades or ohlcv from parquet if exists
# Slow remote fetch path — uncomment to (re)build the trades cache:
#trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#filenames = [dir+"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet",dir+"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet"]
# NOTE(review): the filename is hardcoded and differs from `file_trades`
# (16:00 vs 15:30 stop time) — confirm this is the intended cache file.
trades_df = pd.read_parquet(dir+"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet",engine='pyarrow')
#focused = trades_df.loc["2024-02-16 11:23:11":"2024-02-16 11:24:26"]
#focused
# Aggregate raw trades into time bars (resolution=1, BarType.TIME; the
# recorded output below shows one bar per second) and cache the result.
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME)
ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip')

#ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
# trades_df = pd.read_parquet(file_trades,engine='pyarrow')
In [19]:
# Schema check: column dtypes and memory footprint of the aggregated bars.
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00
Data columns (total 10 columns):
 #   Column      Dtype                     
---  ------      -----                     
 0   open        float64                   
 1   high        float64                   
 2   low         float64                   
 3   close       float64                   
 4   volume      float64                   
 5   trades      float64                   
 6   updated     datetime64[ns, US/Eastern]
 7   vwap        float64                   
 8   buyvolume   float64                   
 9   sellvolume  float64                   
dtypes: datetime64[ns, US/Eastern](1), float64(9)
memory usage: 381.8 MB
In [16]:
# Slice trades on the (symbol, timestamp) MultiIndex for a ~2-minute window.
a = trades_df.loc[("BAC", "2024-02-16 09:30"):("BAC","2024-02-16 09:32:11")]
a
Out[16]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230899 [ ] A
2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230900 [ , Q] A
2024-02-16 09:30:00.708660-05:00 P 33.92 200 52983525230901 [ ] A
2024-02-16 09:30:00.708687-05:00 P 33.92 200 52983525230903 [ ] A
2024-02-16 09:30:00.708744-05:00 P 33.92 100 52983525230904 [ ] A
... ... ... ... ... ... ...
2024-02-16 09:32:10.624595-05:00 P 33.86 400 52983525421904 [ ] A
2024-02-16 09:32:10.625001-05:00 K 33.86 400 52983525100399 [ ] A
2024-02-16 09:32:10.625151-05:00 T 33.86 100 62879199619393 [ ] A
2024-02-16 09:32:10.625157-05:00 T 33.86 150 62879199619395 [ ] A
2024-02-16 09:32:10.625157-05:00 T 33.86 400 62879199619396 [ ] A

839 rows × 6 columns

In [14]:
# Display the aggregated OHLCV bars (head/tail only, per display.max_rows).
ohlcv_df
Out[14]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades updated vwap buyvolume sellvolume
time
2024-02-15 15:58:00-05:00 34.0450 34.0499 34.0400 34.045 7032.0 19.0 2024-02-15 15:58:01.013622-05:00 34.045109 1976.0 1663.0
2024-02-15 15:58:01-05:00 34.0450 34.0500 34.0401 34.045 9216.0 39.0 2024-02-15 15:58:02.103157-05:00 34.048374 600.0 733.0
2024-02-15 15:58:02-05:00 34.0499 34.0500 34.0401 34.050 2404.0 20.0 2024-02-15 15:58:03.139942-05:00 34.048829 622.0 382.0
2024-02-15 15:58:03-05:00 34.0499 34.0500 34.0425 34.050 2481.0 11.0 2024-02-15 15:58:04.047101-05:00 34.046025 501.0 565.0
2024-02-15 15:58:04-05:00 34.0500 34.0599 34.0500 34.055 21532.0 57.0 2024-02-15 15:58:05.143236-05:00 34.051678 1465.0 1637.0
... ... ... ... ... ... ... ... ... ... ...
2024-02-16 09:34:52-05:00 33.7050 33.7050 33.7050 33.705 100.0 1.0 2024-02-16 09:34:53.669980-05:00 33.705000 0.0 100.0
2024-02-16 09:34:53-05:00 33.7050 33.7050 33.7050 33.705 300.0 1.0 2024-02-16 09:34:55.212184-05:00 33.705000 0.0 0.0
2024-02-16 09:34:55-05:00 33.7092 33.7092 33.7050 33.705 304.0 2.0 2024-02-16 09:34:56.304454-05:00 33.706382 100.0 204.0
2024-02-16 09:34:56-05:00 33.7050 33.7150 33.7050 33.715 900.0 9.0 2024-02-16 09:34:58.166409-05:00 33.711667 200.0 0.0
2024-02-16 09:34:58-05:00 33.7150 33.7150 33.7100 33.710 2461.0 14.0 2024-02-16 09:34:58.958611-05:00 33.710707 100.0 200.0

324 rows × 10 columns

In [13]:
#trades_df.info()
# Trades for a ~1-hour window, sliced on the (symbol, timestamp) MultiIndex.
focused = trades_df.loc[("BAC", "2024-02-16 09:30:00"):("BAC", "2024-02-16 10:24:26")]
focused
Out[13]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-02-16 09:30:00.708657-05:00 P 33.920 200 52983525230899 [ ] A
2024-02-16 09:30:00.708657-05:00 P 33.920 200 52983525230900 [ , Q] A
2024-02-16 09:30:00.708660-05:00 P 33.920 200 52983525230901 [ ] A
2024-02-16 09:30:00.708687-05:00 P 33.920 200 52983525230903 [ ] A
2024-02-16 09:30:00.708744-05:00 P 33.920 100 52983525230904 [ ] A
... ... ... ... ... ... ...
2024-02-16 10:24:25.203728-05:00 D 33.930 434 71679923009363 [ ] A
2024-02-16 10:24:25.204664-05:00 D 33.930 118 71679923009364 [ ] A
2024-02-16 10:24:25.269020-05:00 D 33.930 580 71679923009601 [ ] A
2024-02-16 10:24:25.296243-05:00 D 33.930 325 71679923009612 [ ] A
2024-02-16 10:24:26.092502-05:00 N 33.925 275 52983526543427 [ ] A

12032 rows × 6 columns

In [10]:
# NOTE(review): slicing with bare timestamp strings on the (symbol, timestamp)
# MultiIndex compares against the first level (symbol), so this returned an
# empty frame in the recorded run — include the symbol level as in the cells above.
trades_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
Out[10]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
In [9]:
# ohlcv_df has a plain DatetimeIndex, so time-string slicing works directly here.
focohlc = ohlcv_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
focohlc
Out[9]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades updated vwap buyvolume sellvolume
time
2024-02-16 09:30:00-05:00 33.920 33.92 33.900 33.900 1700.0 12.0 2024-02-16 09:30:01.023464-05:00 33.915882 0.0 400.0
2024-02-16 09:30:01-05:00 33.870 33.96 33.860 33.960 931216.0 110.0 2024-02-16 09:30:02.118683-05:00 33.949280 912802.0 4471.0
2024-02-16 09:30:02-05:00 33.960 33.98 33.945 33.945 1160.0 5.0 2024-02-16 09:30:03.440994-05:00 33.959655 180.0 300.0
2024-02-16 09:30:03-05:00 33.930 33.95 33.921 33.925 5376.0 13.0 2024-02-16 09:30:04.515116-05:00 33.934600 1548.0 991.0
2024-02-16 09:30:04-05:00 33.900 33.94 33.870 33.890 4759.0 13.0 2024-02-16 09:30:05.163964-05:00 33.874182 570.0 2889.0
... ... ... ... ... ... ... ... ... ... ...
2024-02-16 10:24:22-05:00 33.930 33.93 33.930 33.930 1626.0 8.0 2024-02-16 10:24:23.246513-05:00 33.930000 0.0 0.0
2024-02-16 10:24:23-05:00 33.930 33.93 33.925 33.930 2200.0 12.0 2024-02-16 10:24:24.061319-05:00 33.929545 400.0 200.0
2024-02-16 10:24:24-05:00 33.930 33.93 33.930 33.930 5054.0 10.0 2024-02-16 10:24:25.017519-05:00 33.930000 0.0 0.0
2024-02-16 10:24:25-05:00 33.930 33.93 33.930 33.930 3712.0 8.0 2024-02-16 10:24:26.092502-05:00 33.930000 0.0 0.0
2024-02-16 10:24:26-05:00 33.925 33.93 33.915 33.920 7457.0 33.0 2024-02-16 10:24:27.112140-05:00 33.920636 699.0 1461.0

2077 rows × 10 columns

In [8]:
# NOTE(review): this raised NameError in the recorded run — `focohlc` is defined
# in a different cell that had not been executed yet (out-of-order / hidden
# kernel state). Run the defining cell first, or merge the two cells.
focohlc.info()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 focohlc.info()

NameError: name 'focohlc' is not defined
In [5]:
#trades_df.to_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet", engine='pyarrow', compression='gzip')
#trades_df = pd.read_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet",engine='pyarrow')

#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
In [6]:
# Display the full trades frame (MultiIndex: symbol, timestamp — see output).
trades_df
Out[6]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-01-02 09:30:00.329729-05:00 T 33.390 9684 62879149669684 [ , Q] A
2024-01-02 09:30:00.758040-05:00 P 33.430 700 52983525495600 [ ] A
2024-01-02 09:30:00.758997-05:00 P 33.440 400 52983525495601 [ ] A
2024-01-02 09:30:01.086662-05:00 N 33.460 429483 52983525028681 [ , Q] A
2024-01-02 09:30:01.086708-05:00 P 33.450 100 52983525495863 [ ] A
... ... ... ... ... ... ...
2024-05-14 15:59:59.976990-04:00 N 38.490 42501 52983576482130 [ ] A
2024-05-14 15:59:59.977038-04:00 N 38.495 100 52983576482132 [ ] A
2024-05-14 15:59:59.977065-04:00 N 38.495 140 52983576482133 [ ] A
2024-05-14 15:59:59.977618-04:00 T 38.495 140 62883460167226 [ ] A
2024-05-14 15:59:59.978364-04:00 D 38.495 100 71714533561907 [ ] A

6467196 rows × 6 columns

In [4]:
# Echo the trades cache path built in the setup cell.
file_trades
Out[4]:
'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'
In [ ]:
#list all files in the dir directory with parquet extension
dir = DATA_DIR + "/notebooks/"
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
# Fix: the original built `files` but never showed it, and left a dead
# `file_name = ""` assignment — print the listing as the comment promises.
for f in files:
    print(f)
# Load the cached aggregated bars built earlier in the notebook.
ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
In [ ]:
# Inspect the bars just reloaded from the parquet cache.
ohlcv_df
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate per-bar close-to-close returns.
# Fixes vs. the original cell:
#  - the histogram referenced an undefined name `df`; the frame is `ohlcv_df`
#  - `.dropna()` on the assigned Series was a no-op (index alignment restores
#    the NaN on assignment), so NaNs are dropped at plot time instead
ohlcv_df['returns'] = ohlcv_df['close'].pct_change()
#same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na


# Plot the probability distribution curve
plt.figure(figsize=(10, 6))
sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)
plt.title('Probability Distribution of Daily Returns')
plt.xlabel('Daily Returns')
plt.ylabel('Probability')
plt.show()
In [ ]:
import pandas as pd
import numpy as np
# NOTE(review): the original sklearn imports (train_test_split, StandardScaler,
# LogisticRegression, accuracy_score) were unused anywhere in the notebook and
# have been removed.

# Define the intervals from 5 to 20 bars: look-backs 5, 10, 15, 20
#maybe use rolling window?
intervals = range(5, 21, 5)

# Rolling window used to z-score the returns
rolling_window = 50

# Create the percentage-return columns first.
# Fix: the original loop read `returns_{N}` columns that were never created,
# which raises KeyError on a fresh Restart-and-Run-All.
for N in intervals:
    ohlcv_df[f'returns_{N}'] = ohlcv_df['close'].pct_change(N)

# Normalize the returns using rolling mean and std (rolling z-score)
for N in intervals:
    column_name = f'returns_{N}'
    rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean()
    rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std()
    ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std

# Display the dataframe with normalized return columns
ohlcv_df
In [ ]:
# Calculate the sum of the normalized return columns for each row
# (the `norm_returns_{N}` columns are produced by the normalization cell above)
ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1)

# Sort the DataFrame based on the sum of normalized returns in descending order
df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False)

# Display the top rows with the highest sum of normalized returns
df_sorted
In [ ]:
# Drop the leading rows made NaN by pct_change / rolling stats.
# NOTE: mutates ohlcv_df for all subsequent cells.
ohlcv_df.dropna(inplace=True)

# Overlay the return distribution for every look-back interval on one axes.
fig, ax = plt.subplots(figsize=(14, 8))
for interval in intervals:
    sns.kdeplot(ohlcv_df[f'returns_{interval}'].dropna(), label=f'Returns {interval}', fill=True, ax=ax)

ax.set_title('Probability Distribution of Percentage Returns')
ax.set_xlabel('Percentage Return')
ax.set_ylabel('Density')
ax.legend()
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the single-bar `returns` column (explicit axes interface).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30, ax=ax)
ax.set_title('Probability Distribution of Daily Returns')
ax.set_xlabel('Daily Returns')
ax.set_ylabel('Probability')
plt.show()
In [ ]:
#show only rows from ohlcv_df where returns > 0.0005
#(NOTE(review): the original comment said 0.005 but the code filters at 0.0005)
ohlcv_df[ohlcv_df['returns'] > 0.0005]

#ohlcv_df[ohlcv_df['returns'] < -0.005]
In [ ]:
#ohlcv bars for 2024-03-13 between 12:00 and 13:00 (inclusive label slice)
a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00']
a
In [ ]:
# Re-display the bars after the derived return columns were added.
ohlcv_df
In [ ]:
# Re-display the raw trades frame.
trades_df
In [ ]:
# dtypes / memory footprint after the derived columns were added.
ohlcv_df.info()
In [ ]:
# Fix: extension typo ".parquett" -> ".parquet" so the listing cells above
# (which filter on ".parquet") can find this cache file.
trades_df.to_parquet("trades_df-spy-0111-0111.parquet", engine='pyarrow', compression='gzip')
In [ ]:
# Fix: extension typo ".parquett" -> ".parquet" so the listing cells above
# (which filter on ".parquet") can find this cache file.
# `allow_truncated_timestamps` is forwarded to pyarrow's writer.
trades_df.to_parquet("trades_df-spy-111-0516.parquet", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)
In [ ]:
# Fix: extension typo ".parquett" -> ".parquet" so the listing cells above
# (which filter on ".parquet") can find this cache file.
ohlcv_df.to_parquet("ohlcv_df-spy-111-0516.parquet", engine='pyarrow', compression='gzip')
In [ ]:
# Wrap the aggregated bars in a vectorbtpro Data object keyed by symbol,
# convert to NY timezone, and plot the OHLCV chart with range breaks enabled.
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()
In [ ]:
#access just BCA
#df_filtered = df.loc["BAC"]