57 KiB
57 KiB
Loading trades and vectorized aggregation¶
Describes how to fetch trades (remote/cached) and use new vectorized aggregation to aggregate bars of given type (time, volume, dollar) and resolution
fetch_trades_parallel fetches trades for a given symbol and interval (remote or cached); it can also filter by trade conditions and minimum size. Returns trades_df.
aggregate_trades accepts trades_df, a resolution, and a bar type (VOLUME, TIME, DOLLAR), and returns an aggregated OHLCV dataframe ohlcv_df.
In [20]:
# Notebook setup: imports, plotting theme, trade-filter parameters, and the
# parquet cache-file paths for the trades/OHLCV frames.
import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time
from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt
import v2realbot.utils.config_handler as cfh

# Plotting defaults.
vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True

# Pandas display options (paginated HTML repr).
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 20)  # number of rows per page
# pd.set_option('display.float_format', '{:.9f}'.format)

# Trade filtering: condition codes to exclude and the minimum trade size.
# Standard excludes: ['C','O','4','B','7','V','P','W','U','Z','F']
exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES')
minsize = 100
symbol = "BAC"

# Interval boundaries, localized to the NY trading timezone.
day_start = zoneNY.localize(datetime(2023, 1, 1, 9, 30, 0))
day_stop = zoneNY.localize(datetime(2024, 5, 25, 15, 30, 0))

# Parquet cache filenames: dates are ISO-like with ":" replaced by "_", and the
# names encode the applied condition-exclusion string and minimum size.
# NOTE(review): `dir` shadows the builtin dir(); kept because later cells use it.
dir = DATA_DIR + "/notebooks/"
file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
# file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet"
file_ohlcv = dir + f"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
print(file_trades)
print(file_ohlcv)

# Print every parquet file already cached in the notebooks directory.
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
for f in files:
    print(f)
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet /Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
In [18]:
#Either load trades or ohlcv from parquet if exists #trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F']) #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip') #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip') #filenames = [dir+"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet",dir+"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet"] trades_df = pd.read_parquet(dir+"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet",engine='pyarrow') #focused = trades_df.loc["2024-02-16 11:23:11":"2024-02-16 11:24:26"] #focused ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME) ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip') #ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow') # trades_df = pd.read_parquet(file_trades,engine='pyarrow')
In [19]:
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 open float64 1 high float64 2 low float64 3 close float64 4 volume float64 5 trades float64 6 updated datetime64[ns, US/Eastern] 7 vwap float64 8 buyvolume float64 9 sellvolume float64 dtypes: datetime64[ns, US/Eastern](1), float64(9) memory usage: 381.8 MB
In [16]:
a = trades_df.loc[("BAC", "2024-02-16 09:30"):("BAC","2024-02-16 09:32:11")] a
Out[16]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-02-16 09:30:00.708657-05:00 | P | 33.92 | 200 | 52983525230899 | [ ] | A |
| 2024-02-16 09:30:00.708657-05:00 | P | 33.92 | 200 | 52983525230900 | [ , Q] | A | |
| 2024-02-16 09:30:00.708660-05:00 | P | 33.92 | 200 | 52983525230901 | [ ] | A | |
| 2024-02-16 09:30:00.708687-05:00 | P | 33.92 | 200 | 52983525230903 | [ ] | A | |
| 2024-02-16 09:30:00.708744-05:00 | P | 33.92 | 100 | 52983525230904 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-02-16 09:32:10.624595-05:00 | P | 33.86 | 400 | 52983525421904 | [ ] | A | |
| 2024-02-16 09:32:10.625001-05:00 | K | 33.86 | 400 | 52983525100399 | [ ] | A | |
| 2024-02-16 09:32:10.625151-05:00 | T | 33.86 | 100 | 62879199619393 | [ ] | A | |
| 2024-02-16 09:32:10.625157-05:00 | T | 33.86 | 150 | 62879199619395 | [ ] | A | |
| 2024-02-16 09:32:10.625157-05:00 | T | 33.86 | 400 | 62879199619396 | [ ] | A |
839 rows × 6 columns
In [14]:
ohlcv_df
Out[14]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| open | high | low | close | volume | trades | updated | vwap | buyvolume | sellvolume | |
|---|---|---|---|---|---|---|---|---|---|---|
| time | ||||||||||
| 2024-02-15 15:58:00-05:00 | 34.0450 | 34.0499 | 34.0400 | 34.045 | 7032.0 | 19.0 | 2024-02-15 15:58:01.013622-05:00 | 34.045109 | 1976.0 | 1663.0 |
| 2024-02-15 15:58:01-05:00 | 34.0450 | 34.0500 | 34.0401 | 34.045 | 9216.0 | 39.0 | 2024-02-15 15:58:02.103157-05:00 | 34.048374 | 600.0 | 733.0 |
| 2024-02-15 15:58:02-05:00 | 34.0499 | 34.0500 | 34.0401 | 34.050 | 2404.0 | 20.0 | 2024-02-15 15:58:03.139942-05:00 | 34.048829 | 622.0 | 382.0 |
| 2024-02-15 15:58:03-05:00 | 34.0499 | 34.0500 | 34.0425 | 34.050 | 2481.0 | 11.0 | 2024-02-15 15:58:04.047101-05:00 | 34.046025 | 501.0 | 565.0 |
| 2024-02-15 15:58:04-05:00 | 34.0500 | 34.0599 | 34.0500 | 34.055 | 21532.0 | 57.0 | 2024-02-15 15:58:05.143236-05:00 | 34.051678 | 1465.0 | 1637.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-02-16 09:34:52-05:00 | 33.7050 | 33.7050 | 33.7050 | 33.705 | 100.0 | 1.0 | 2024-02-16 09:34:53.669980-05:00 | 33.705000 | 0.0 | 100.0 |
| 2024-02-16 09:34:53-05:00 | 33.7050 | 33.7050 | 33.7050 | 33.705 | 300.0 | 1.0 | 2024-02-16 09:34:55.212184-05:00 | 33.705000 | 0.0 | 0.0 |
| 2024-02-16 09:34:55-05:00 | 33.7092 | 33.7092 | 33.7050 | 33.705 | 304.0 | 2.0 | 2024-02-16 09:34:56.304454-05:00 | 33.706382 | 100.0 | 204.0 |
| 2024-02-16 09:34:56-05:00 | 33.7050 | 33.7150 | 33.7050 | 33.715 | 900.0 | 9.0 | 2024-02-16 09:34:58.166409-05:00 | 33.711667 | 200.0 | 0.0 |
| 2024-02-16 09:34:58-05:00 | 33.7150 | 33.7150 | 33.7100 | 33.710 | 2461.0 | 14.0 | 2024-02-16 09:34:58.958611-05:00 | 33.710707 | 100.0 | 200.0 |
324 rows × 10 columns
In [13]:
#trades_df.info() focused = trades_df.loc[("BAC", "2024-02-16 09:30:00"):("BAC", "2024-02-16 10:24:26")] focused
Out[13]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-02-16 09:30:00.708657-05:00 | P | 33.920 | 200 | 52983525230899 | [ ] | A |
| 2024-02-16 09:30:00.708657-05:00 | P | 33.920 | 200 | 52983525230900 | [ , Q] | A | |
| 2024-02-16 09:30:00.708660-05:00 | P | 33.920 | 200 | 52983525230901 | [ ] | A | |
| 2024-02-16 09:30:00.708687-05:00 | P | 33.920 | 200 | 52983525230903 | [ ] | A | |
| 2024-02-16 09:30:00.708744-05:00 | P | 33.920 | 100 | 52983525230904 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-02-16 10:24:25.203728-05:00 | D | 33.930 | 434 | 71679923009363 | [ ] | A | |
| 2024-02-16 10:24:25.204664-05:00 | D | 33.930 | 118 | 71679923009364 | [ ] | A | |
| 2024-02-16 10:24:25.269020-05:00 | D | 33.930 | 580 | 71679923009601 | [ ] | A | |
| 2024-02-16 10:24:25.296243-05:00 | D | 33.930 | 325 | 71679923009612 | [ ] | A | |
| 2024-02-16 10:24:26.092502-05:00 | N | 33.925 | 275 | 52983526543427 | [ ] | A |
12032 rows × 6 columns
In [10]:
trades_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
Out[10]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp |
In [9]:
focohlc = ohlcv_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"] focohlc
Out[9]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| open | high | low | close | volume | trades | updated | vwap | buyvolume | sellvolume | |
|---|---|---|---|---|---|---|---|---|---|---|
| time | ||||||||||
| 2024-02-16 09:30:00-05:00 | 33.920 | 33.92 | 33.900 | 33.900 | 1700.0 | 12.0 | 2024-02-16 09:30:01.023464-05:00 | 33.915882 | 0.0 | 400.0 |
| 2024-02-16 09:30:01-05:00 | 33.870 | 33.96 | 33.860 | 33.960 | 931216.0 | 110.0 | 2024-02-16 09:30:02.118683-05:00 | 33.949280 | 912802.0 | 4471.0 |
| 2024-02-16 09:30:02-05:00 | 33.960 | 33.98 | 33.945 | 33.945 | 1160.0 | 5.0 | 2024-02-16 09:30:03.440994-05:00 | 33.959655 | 180.0 | 300.0 |
| 2024-02-16 09:30:03-05:00 | 33.930 | 33.95 | 33.921 | 33.925 | 5376.0 | 13.0 | 2024-02-16 09:30:04.515116-05:00 | 33.934600 | 1548.0 | 991.0 |
| 2024-02-16 09:30:04-05:00 | 33.900 | 33.94 | 33.870 | 33.890 | 4759.0 | 13.0 | 2024-02-16 09:30:05.163964-05:00 | 33.874182 | 570.0 | 2889.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-02-16 10:24:22-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 1626.0 | 8.0 | 2024-02-16 10:24:23.246513-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:23-05:00 | 33.930 | 33.93 | 33.925 | 33.930 | 2200.0 | 12.0 | 2024-02-16 10:24:24.061319-05:00 | 33.929545 | 400.0 | 200.0 |
| 2024-02-16 10:24:24-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 5054.0 | 10.0 | 2024-02-16 10:24:25.017519-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:25-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 3712.0 | 8.0 | 2024-02-16 10:24:26.092502-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:26-05:00 | 33.925 | 33.93 | 33.915 | 33.920 | 7457.0 | 33.0 | 2024-02-16 10:24:27.112140-05:00 | 33.920636 | 699.0 | 1461.0 |
2077 rows × 10 columns
In [8]:
focohlc.info()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[8], line 1 ----> 1 focohlc.info() NameError: name 'focohlc' is not defined
In [5]:
#trades_df.to_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet", engine='pyarrow', compression='gzip') #trades_df = pd.read_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet",engine='pyarrow') #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
In [6]:
trades_df
Out[6]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-01-02 09:30:00.329729-05:00 | T | 33.390 | 9684 | 62879149669684 | [ , Q] | A |
| 2024-01-02 09:30:00.758040-05:00 | P | 33.430 | 700 | 52983525495600 | [ ] | A | |
| 2024-01-02 09:30:00.758997-05:00 | P | 33.440 | 400 | 52983525495601 | [ ] | A | |
| 2024-01-02 09:30:01.086662-05:00 | N | 33.460 | 429483 | 52983525028681 | [ , Q] | A | |
| 2024-01-02 09:30:01.086708-05:00 | P | 33.450 | 100 | 52983525495863 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-05-14 15:59:59.976990-04:00 | N | 38.490 | 42501 | 52983576482130 | [ ] | A | |
| 2024-05-14 15:59:59.977038-04:00 | N | 38.495 | 100 | 52983576482132 | [ ] | A | |
| 2024-05-14 15:59:59.977065-04:00 | N | 38.495 | 140 | 52983576482133 | [ ] | A | |
| 2024-05-14 15:59:59.977618-04:00 | T | 38.495 | 140 | 62883460167226 | [ ] | A | |
| 2024-05-14 15:59:59.978364-04:00 | D | 38.495 | 100 | 71714533561907 | [ ] | A |
6467196 rows × 6 columns
In [4]:
file_trades
Out[4]:
'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'
In [ ]:
#list all files is dir directory with parquet extension dir = DATA_DIR + "/notebooks/" import os files = [f for f in os.listdir(dir) if f.endswith(".parquet")] file_name = "" ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
In [ ]:
ohlcv_df
In [ ]:
import matplotlib.pyplot as plt import seaborn as sns # Calculate daily returns ohlcv_df['returns'] = ohlcv_df['close'].pct_change().dropna() #same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na # Plot the probability distribution curve plt.figure(figsize=(10, 6)) sns.histplot(df['returns'].dropna(), kde=True, stat='probability', bins=30) plt.title('Probability Distribution of Daily Returns') plt.xlabel('Daily Returns') plt.ylabel('Probability') plt.show()
In [ ]:
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score # Define the intervals from 5 to 20 s, returns for each interval #maybe use rolling window? intervals = range(5, 21, 5) # Create columns for percentage returns rolling_window = 50 # Normalize the returns using rolling mean and std for N in intervals: column_name = f'returns_{N}' rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean() rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std() ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std # Display the dataframe with normalized return columns ohlcv_df
In [ ]:
# Calculate the sum of the normalized return columns for each row ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1) # Sort the DataFrame based on the sum of normalized returns in descending order df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False) # Display the top rows with the highest sum of normalized returns df_sorted
In [ ]:
# Drop initial rows with NaN values due to pct_change ohlcv_df.dropna(inplace=True) # Plotting the probability distribution curves plt.figure(figsize=(14, 8)) for N in intervals: sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True) plt.title('Probability Distribution of Percentage Returns') plt.xlabel('Percentage Return') plt.ylabel('Density') plt.legend() plt.show()
In [ ]:
import matplotlib.pyplot as plt import seaborn as sns # Plot the probability distribution curve plt.figure(figsize=(10, 6)) sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30) plt.title('Probability Distribution of Daily Returns') plt.xlabel('Daily Returns') plt.ylabel('Probability') plt.show()
In [ ]:
#show only rows from ohlcv_df where returns > 0.005 ohlcv_df[ohlcv_df['returns'] > 0.0005] #ohlcv_df[ohlcv_df['returns'] < -0.005]
In [ ]:
#ohlcv where index = date 2024-03-13 and between hour 12 a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00'] a
In [ ]:
ohlcv_df
In [ ]:
trades_df
In [ ]:
ohlcv_df.info()
In [ ]:
trades_df.to_parquet("trades_df-spy-0111-0111.parquett", engine='pyarrow', compression='gzip')
In [ ]:
trades_df.to_parquet("trades_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)
In [ ]:
ohlcv_df.to_parquet("ohlcv_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip')
In [ ]:
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY) vbt.settings['plotting']['auto_rangebreaks'] = True basic_data.ohlcv.plot()
In [ ]:
#access just BCA #df_filtered = df.loc["BAC"]