Files
v2realbot/research/prepare_aggregatied_data.ipynb
2024-06-13 11:02:41 +02:00

57 KiB
Raw Permalink Blame History

Loading trades and vectorized aggregation

This notebook fetches the trades from remote or local cache and aggregates them to bars of given type (time, volume, dollar) and resolution

fetch_trades_parallel enables to fetch trades of given symbol and interval, also can filter conditions and minimum size. return trades_df aggregate_trades acceptss trades_df and ressolution and type of bars (VOLUME, TIME, DOLLAR) and return aggregated ohlcv dataframe ohlcv_df

In [20]:
import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time
from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt
import v2realbot.utils.config_handler as cfh

vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# Set the option to display with pagination
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 20)  # Number of rows per page
# pd.set_option('display.float_format', '{:.9f}'.format)


#trade filtering
exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F']
minsize = 100

symbol = "BAC"
#datetime in zoneNY 
day_start = datetime(2023, 1, 1, 9, 30, 0)
day_stop = datetime(2024, 5, 25, 15, 30, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)
#filename of trades_df parquet, date are in isoformat but without time zone part
dir = DATA_DIR + "/notebooks/"
#parquet interval cache contains exclude conditions and minsize filtering
file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
#file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet"
file_ohlcv = dir + f"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
print(file_trades)
print(file_ohlcv)
#PRINT all parquet in directory
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
for f in files:
    print(f)
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
In [18]:
#Either load trades or ohlcv from parquet if exists
#trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#filenames = [dir+"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet",dir+"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet"]
trades_df = pd.read_parquet(dir+"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet",engine='pyarrow')
#focused = trades_df.loc["2024-02-16 11:23:11":"2024-02-16 11:24:26"]
#focused
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME)
ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip')

#ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
# trades_df = pd.read_parquet(file_trades,engine='pyarrow')
In [19]:
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00
Data columns (total 10 columns):
 #   Column      Dtype                     
---  ------      -----                     
 0   open        float64                   
 1   high        float64                   
 2   low         float64                   
 3   close       float64                   
 4   volume      float64                   
 5   trades      float64                   
 6   updated     datetime64[ns, US/Eastern]
 7   vwap        float64                   
 8   buyvolume   float64                   
 9   sellvolume  float64                   
dtypes: datetime64[ns, US/Eastern](1), float64(9)
memory usage: 381.8 MB
In [16]:
a = trades_df.loc[("BAC", "2024-02-16 09:30"):("BAC","2024-02-16 09:32:11")]
a
Out[16]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230899 [ ] A
2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230900 [ , Q] A
2024-02-16 09:30:00.708660-05:00 P 33.92 200 52983525230901 [ ] A
2024-02-16 09:30:00.708687-05:00 P 33.92 200 52983525230903 [ ] A
2024-02-16 09:30:00.708744-05:00 P 33.92 100 52983525230904 [ ] A
... ... ... ... ... ... ...
2024-02-16 09:32:10.624595-05:00 P 33.86 400 52983525421904 [ ] A
2024-02-16 09:32:10.625001-05:00 K 33.86 400 52983525100399 [ ] A
2024-02-16 09:32:10.625151-05:00 T 33.86 100 62879199619393 [ ] A
2024-02-16 09:32:10.625157-05:00 T 33.86 150 62879199619395 [ ] A
2024-02-16 09:32:10.625157-05:00 T 33.86 400 62879199619396 [ ] A

839 rows × 6 columns

In [14]:
ohlcv_df
Out[14]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades updated vwap buyvolume sellvolume
time
2024-02-15 15:58:00-05:00 34.0450 34.0499 34.0400 34.045 7032.0 19.0 2024-02-15 15:58:01.013622-05:00 34.045109 1976.0 1663.0
2024-02-15 15:58:01-05:00 34.0450 34.0500 34.0401 34.045 9216.0 39.0 2024-02-15 15:58:02.103157-05:00 34.048374 600.0 733.0
2024-02-15 15:58:02-05:00 34.0499 34.0500 34.0401 34.050 2404.0 20.0 2024-02-15 15:58:03.139942-05:00 34.048829 622.0 382.0
2024-02-15 15:58:03-05:00 34.0499 34.0500 34.0425 34.050 2481.0 11.0 2024-02-15 15:58:04.047101-05:00 34.046025 501.0 565.0
2024-02-15 15:58:04-05:00 34.0500 34.0599 34.0500 34.055 21532.0 57.0 2024-02-15 15:58:05.143236-05:00 34.051678 1465.0 1637.0
... ... ... ... ... ... ... ... ... ... ...
2024-02-16 09:34:52-05:00 33.7050 33.7050 33.7050 33.705 100.0 1.0 2024-02-16 09:34:53.669980-05:00 33.705000 0.0 100.0
2024-02-16 09:34:53-05:00 33.7050 33.7050 33.7050 33.705 300.0 1.0 2024-02-16 09:34:55.212184-05:00 33.705000 0.0 0.0
2024-02-16 09:34:55-05:00 33.7092 33.7092 33.7050 33.705 304.0 2.0 2024-02-16 09:34:56.304454-05:00 33.706382 100.0 204.0
2024-02-16 09:34:56-05:00 33.7050 33.7150 33.7050 33.715 900.0 9.0 2024-02-16 09:34:58.166409-05:00 33.711667 200.0 0.0
2024-02-16 09:34:58-05:00 33.7150 33.7150 33.7100 33.710 2461.0 14.0 2024-02-16 09:34:58.958611-05:00 33.710707 100.0 200.0

324 rows × 10 columns

In [13]:
#trades_df.info()
focused = trades_df.loc[("BAC", "2024-02-16 09:30:00"):("BAC", "2024-02-16 10:24:26")]
focused
Out[13]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-02-16 09:30:00.708657-05:00 P 33.920 200 52983525230899 [ ] A
2024-02-16 09:30:00.708657-05:00 P 33.920 200 52983525230900 [ , Q] A
2024-02-16 09:30:00.708660-05:00 P 33.920 200 52983525230901 [ ] A
2024-02-16 09:30:00.708687-05:00 P 33.920 200 52983525230903 [ ] A
2024-02-16 09:30:00.708744-05:00 P 33.920 100 52983525230904 [ ] A
... ... ... ... ... ... ...
2024-02-16 10:24:25.203728-05:00 D 33.930 434 71679923009363 [ ] A
2024-02-16 10:24:25.204664-05:00 D 33.930 118 71679923009364 [ ] A
2024-02-16 10:24:25.269020-05:00 D 33.930 580 71679923009601 [ ] A
2024-02-16 10:24:25.296243-05:00 D 33.930 325 71679923009612 [ ] A
2024-02-16 10:24:26.092502-05:00 N 33.925 275 52983526543427 [ ] A

12032 rows × 6 columns

In [10]:
trades_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
Out[10]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
In [9]:
focohlc = ohlcv_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
focohlc
Out[9]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
open high low close volume trades updated vwap buyvolume sellvolume
time
2024-02-16 09:30:00-05:00 33.920 33.92 33.900 33.900 1700.0 12.0 2024-02-16 09:30:01.023464-05:00 33.915882 0.0 400.0
2024-02-16 09:30:01-05:00 33.870 33.96 33.860 33.960 931216.0 110.0 2024-02-16 09:30:02.118683-05:00 33.949280 912802.0 4471.0
2024-02-16 09:30:02-05:00 33.960 33.98 33.945 33.945 1160.0 5.0 2024-02-16 09:30:03.440994-05:00 33.959655 180.0 300.0
2024-02-16 09:30:03-05:00 33.930 33.95 33.921 33.925 5376.0 13.0 2024-02-16 09:30:04.515116-05:00 33.934600 1548.0 991.0
2024-02-16 09:30:04-05:00 33.900 33.94 33.870 33.890 4759.0 13.0 2024-02-16 09:30:05.163964-05:00 33.874182 570.0 2889.0
... ... ... ... ... ... ... ... ... ... ...
2024-02-16 10:24:22-05:00 33.930 33.93 33.930 33.930 1626.0 8.0 2024-02-16 10:24:23.246513-05:00 33.930000 0.0 0.0
2024-02-16 10:24:23-05:00 33.930 33.93 33.925 33.930 2200.0 12.0 2024-02-16 10:24:24.061319-05:00 33.929545 400.0 200.0
2024-02-16 10:24:24-05:00 33.930 33.93 33.930 33.930 5054.0 10.0 2024-02-16 10:24:25.017519-05:00 33.930000 0.0 0.0
2024-02-16 10:24:25-05:00 33.930 33.93 33.930 33.930 3712.0 8.0 2024-02-16 10:24:26.092502-05:00 33.930000 0.0 0.0
2024-02-16 10:24:26-05:00 33.925 33.93 33.915 33.920 7457.0 33.0 2024-02-16 10:24:27.112140-05:00 33.920636 699.0 1461.0

2077 rows × 10 columns

In [8]:
focohlc.info()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 focohlc.info()

NameError: name 'focohlc' is not defined
In [5]:
#trades_df.to_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet", engine='pyarrow', compression='gzip')
#trades_df = pd.read_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet",engine='pyarrow')

#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
In [6]:
trades_df
Out[6]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
symbol timestamp
BAC 2024-01-02 09:30:00.329729-05:00 T 33.390 9684 62879149669684 [ , Q] A
2024-01-02 09:30:00.758040-05:00 P 33.430 700 52983525495600 [ ] A
2024-01-02 09:30:00.758997-05:00 P 33.440 400 52983525495601 [ ] A
2024-01-02 09:30:01.086662-05:00 N 33.460 429483 52983525028681 [ , Q] A
2024-01-02 09:30:01.086708-05:00 P 33.450 100 52983525495863 [ ] A
... ... ... ... ... ... ...
2024-05-14 15:59:59.976990-04:00 N 38.490 42501 52983576482130 [ ] A
2024-05-14 15:59:59.977038-04:00 N 38.495 100 52983576482132 [ ] A
2024-05-14 15:59:59.977065-04:00 N 38.495 140 52983576482133 [ ] A
2024-05-14 15:59:59.977618-04:00 T 38.495 140 62883460167226 [ ] A
2024-05-14 15:59:59.978364-04:00 D 38.495 100 71714533561907 [ ] A

6467196 rows × 6 columns

In [4]:
file_trades
Out[4]:
'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'
In [ ]:
#list all files is dir directory with parquet extension
dir = DATA_DIR + "/notebooks/"
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
file_name = ""
ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
In [ ]:
ohlcv_df
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Calculate daily returns
ohlcv_df['returns'] = ohlcv_df['close'].pct_change().dropna()
#same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na


# Plot the probability distribution curve
plt.figure(figsize=(10, 6))
sns.histplot(df['returns'].dropna(), kde=True, stat='probability', bins=30)
plt.title('Probability Distribution of Daily Returns')
plt.xlabel('Daily Returns')
plt.ylabel('Probability')
plt.show()
In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the intervals from 5 to 20 s, returns for each interval
#maybe use rolling window?
intervals = range(5, 21, 5)

# Create columns for percentage returns
rolling_window = 50

# Normalize the returns using rolling mean and std
for N in intervals:
    column_name = f'returns_{N}'
    rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean()
    rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std()
    ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std

# Display the dataframe with normalized return columns
ohlcv_df
In [ ]:
# Calculate the sum of the normalized return columns for each row
ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1)

# Sort the DataFrame based on the sum of normalized returns in descending order
df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False)

# Display the top rows with the highest sum of normalized returns
df_sorted
In [ ]:
# Drop initial rows with NaN values due to pct_change
ohlcv_df.dropna(inplace=True)

# Plotting the probability distribution curves
plt.figure(figsize=(14, 8))
for N in intervals:
    sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True)

plt.title('Probability Distribution of Percentage Returns')
plt.xlabel('Percentage Return')
plt.ylabel('Density')
plt.legend()
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the probability distribution curve
plt.figure(figsize=(10, 6))
sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)
plt.title('Probability Distribution of Daily Returns')
plt.xlabel('Daily Returns')
plt.ylabel('Probability')
plt.show()
In [ ]:
#show only rows from ohlcv_df where returns > 0.005
ohlcv_df[ohlcv_df['returns'] > 0.0005]

#ohlcv_df[ohlcv_df['returns'] < -0.005]
In [ ]:
#ohlcv where index = date 2024-03-13 and between hour 12

a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00']
a
In [ ]:
ohlcv_df
In [ ]:
trades_df
In [ ]:
ohlcv_df.info()
In [ ]:
trades_df.to_parquet("trades_df-spy-0111-0111.parquett", engine='pyarrow', compression='gzip')
In [ ]:
trades_df.to_parquet("trades_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)
In [ ]:
ohlcv_df.to_parquet("ohlcv_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip')
In [ ]:
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()
In [ ]:
#access just BCA
#df_filtered = df.loc["BAC"]