57 KiB
57 KiB
Loading trades and vectorized aggregation¶
This notebook fetches the trades from remote or local cache and aggregates them to bars of given type (time, volume, dollar) and resolution
fetch_trades_parallel enables to fetch trades of given symbol and interval, also can filter conditions and minimum size. return trades_df
aggregate_trades acceptss trades_df and ressolution and type of bars (VOLUME, TIME, DOLLAR) and return aggregated ohlcv dataframe ohlcv_df
In [20]:
import pandas as pd import numpy as np from numba import jit from alpaca.data.historical import StockHistoricalDataClient from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR from alpaca.data.requests import StockTradesRequest from v2realbot.enums.enums import BarType import time from datetime import datetime from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data import pyarrow from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades import vectorbtpro as vbt import v2realbot.utils.config_handler as cfh vbt.settings.set_theme("dark") vbt.settings['plotting']['layout']['width'] = 1280 vbt.settings.plotting.auto_rangebreaks = True # Set the option to display with pagination pd.set_option('display.notebook_repr_html', True) pd.set_option('display.max_rows', 20) # Number of rows per page # pd.set_option('display.float_format', '{:.9f}'.format) #trade filtering exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F'] minsize = 100 symbol = "BAC" #datetime in zoneNY day_start = datetime(2023, 1, 1, 9, 30, 0) day_stop = datetime(2024, 5, 25, 15, 30, 0) day_start = zoneNY.localize(day_start) day_stop = zoneNY.localize(day_stop) #filename of trades_df parquet, date are in isoformat but without time zone part dir = DATA_DIR + "/notebooks/" #parquet interval cache contains exclude conditions and minsize filtering file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet" #file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet" file_ohlcv = dir + f"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet" print(file_trades) print(file_ohlcv) #PRINT all parquet in directory import os files = [f for f in os.listdir(dir) if f.endswith(".parquet")] for f in files: print(f)
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet /Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
In [18]:
#Either load trades or ohlcv from parquet if exists #trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F']) #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip') #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip') #filenames = [dir+"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet",dir+"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet"] trades_df = pd.read_parquet(dir+"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet",engine='pyarrow') #focused = trades_df.loc["2024-02-16 11:23:11":"2024-02-16 11:24:26"] #focused ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME) ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip') #ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow') # trades_df = pd.read_parquet(file_trades,engine='pyarrow')
In [19]:
ohlcv_df.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 open float64 1 high float64 2 low float64 3 close float64 4 volume float64 5 trades float64 6 updated datetime64[ns, US/Eastern] 7 vwap float64 8 buyvolume float64 9 sellvolume float64 dtypes: datetime64[ns, US/Eastern](1), float64(9) memory usage: 381.8 MB
In [16]:
a = trades_df.loc[("BAC", "2024-02-16 09:30"):("BAC","2024-02-16 09:32:11")] a
Out[16]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-02-16 09:30:00.708657-05:00 | P | 33.92 | 200 | 52983525230899 | [ ] | A |
| 2024-02-16 09:30:00.708657-05:00 | P | 33.92 | 200 | 52983525230900 | [ , Q] | A | |
| 2024-02-16 09:30:00.708660-05:00 | P | 33.92 | 200 | 52983525230901 | [ ] | A | |
| 2024-02-16 09:30:00.708687-05:00 | P | 33.92 | 200 | 52983525230903 | [ ] | A | |
| 2024-02-16 09:30:00.708744-05:00 | P | 33.92 | 100 | 52983525230904 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-02-16 09:32:10.624595-05:00 | P | 33.86 | 400 | 52983525421904 | [ ] | A | |
| 2024-02-16 09:32:10.625001-05:00 | K | 33.86 | 400 | 52983525100399 | [ ] | A | |
| 2024-02-16 09:32:10.625151-05:00 | T | 33.86 | 100 | 62879199619393 | [ ] | A | |
| 2024-02-16 09:32:10.625157-05:00 | T | 33.86 | 150 | 62879199619395 | [ ] | A | |
| 2024-02-16 09:32:10.625157-05:00 | T | 33.86 | 400 | 62879199619396 | [ ] | A |
839 rows × 6 columns
In [14]:
ohlcv_df
Out[14]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| open | high | low | close | volume | trades | updated | vwap | buyvolume | sellvolume | |
|---|---|---|---|---|---|---|---|---|---|---|
| time | ||||||||||
| 2024-02-15 15:58:00-05:00 | 34.0450 | 34.0499 | 34.0400 | 34.045 | 7032.0 | 19.0 | 2024-02-15 15:58:01.013622-05:00 | 34.045109 | 1976.0 | 1663.0 |
| 2024-02-15 15:58:01-05:00 | 34.0450 | 34.0500 | 34.0401 | 34.045 | 9216.0 | 39.0 | 2024-02-15 15:58:02.103157-05:00 | 34.048374 | 600.0 | 733.0 |
| 2024-02-15 15:58:02-05:00 | 34.0499 | 34.0500 | 34.0401 | 34.050 | 2404.0 | 20.0 | 2024-02-15 15:58:03.139942-05:00 | 34.048829 | 622.0 | 382.0 |
| 2024-02-15 15:58:03-05:00 | 34.0499 | 34.0500 | 34.0425 | 34.050 | 2481.0 | 11.0 | 2024-02-15 15:58:04.047101-05:00 | 34.046025 | 501.0 | 565.0 |
| 2024-02-15 15:58:04-05:00 | 34.0500 | 34.0599 | 34.0500 | 34.055 | 21532.0 | 57.0 | 2024-02-15 15:58:05.143236-05:00 | 34.051678 | 1465.0 | 1637.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-02-16 09:34:52-05:00 | 33.7050 | 33.7050 | 33.7050 | 33.705 | 100.0 | 1.0 | 2024-02-16 09:34:53.669980-05:00 | 33.705000 | 0.0 | 100.0 |
| 2024-02-16 09:34:53-05:00 | 33.7050 | 33.7050 | 33.7050 | 33.705 | 300.0 | 1.0 | 2024-02-16 09:34:55.212184-05:00 | 33.705000 | 0.0 | 0.0 |
| 2024-02-16 09:34:55-05:00 | 33.7092 | 33.7092 | 33.7050 | 33.705 | 304.0 | 2.0 | 2024-02-16 09:34:56.304454-05:00 | 33.706382 | 100.0 | 204.0 |
| 2024-02-16 09:34:56-05:00 | 33.7050 | 33.7150 | 33.7050 | 33.715 | 900.0 | 9.0 | 2024-02-16 09:34:58.166409-05:00 | 33.711667 | 200.0 | 0.0 |
| 2024-02-16 09:34:58-05:00 | 33.7150 | 33.7150 | 33.7100 | 33.710 | 2461.0 | 14.0 | 2024-02-16 09:34:58.958611-05:00 | 33.710707 | 100.0 | 200.0 |
324 rows × 10 columns
In [13]:
#trades_df.info() focused = trades_df.loc[("BAC", "2024-02-16 09:30:00"):("BAC", "2024-02-16 10:24:26")] focused
Out[13]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-02-16 09:30:00.708657-05:00 | P | 33.920 | 200 | 52983525230899 | [ ] | A |
| 2024-02-16 09:30:00.708657-05:00 | P | 33.920 | 200 | 52983525230900 | [ , Q] | A | |
| 2024-02-16 09:30:00.708660-05:00 | P | 33.920 | 200 | 52983525230901 | [ ] | A | |
| 2024-02-16 09:30:00.708687-05:00 | P | 33.920 | 200 | 52983525230903 | [ ] | A | |
| 2024-02-16 09:30:00.708744-05:00 | P | 33.920 | 100 | 52983525230904 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-02-16 10:24:25.203728-05:00 | D | 33.930 | 434 | 71679923009363 | [ ] | A | |
| 2024-02-16 10:24:25.204664-05:00 | D | 33.930 | 118 | 71679923009364 | [ ] | A | |
| 2024-02-16 10:24:25.269020-05:00 | D | 33.930 | 580 | 71679923009601 | [ ] | A | |
| 2024-02-16 10:24:25.296243-05:00 | D | 33.930 | 325 | 71679923009612 | [ ] | A | |
| 2024-02-16 10:24:26.092502-05:00 | N | 33.925 | 275 | 52983526543427 | [ ] | A |
12032 rows × 6 columns
In [10]:
trades_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
Out[10]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp |
In [9]:
focohlc = ohlcv_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"] focohlc
Out[9]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| open | high | low | close | volume | trades | updated | vwap | buyvolume | sellvolume | |
|---|---|---|---|---|---|---|---|---|---|---|
| time | ||||||||||
| 2024-02-16 09:30:00-05:00 | 33.920 | 33.92 | 33.900 | 33.900 | 1700.0 | 12.0 | 2024-02-16 09:30:01.023464-05:00 | 33.915882 | 0.0 | 400.0 |
| 2024-02-16 09:30:01-05:00 | 33.870 | 33.96 | 33.860 | 33.960 | 931216.0 | 110.0 | 2024-02-16 09:30:02.118683-05:00 | 33.949280 | 912802.0 | 4471.0 |
| 2024-02-16 09:30:02-05:00 | 33.960 | 33.98 | 33.945 | 33.945 | 1160.0 | 5.0 | 2024-02-16 09:30:03.440994-05:00 | 33.959655 | 180.0 | 300.0 |
| 2024-02-16 09:30:03-05:00 | 33.930 | 33.95 | 33.921 | 33.925 | 5376.0 | 13.0 | 2024-02-16 09:30:04.515116-05:00 | 33.934600 | 1548.0 | 991.0 |
| 2024-02-16 09:30:04-05:00 | 33.900 | 33.94 | 33.870 | 33.890 | 4759.0 | 13.0 | 2024-02-16 09:30:05.163964-05:00 | 33.874182 | 570.0 | 2889.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-02-16 10:24:22-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 1626.0 | 8.0 | 2024-02-16 10:24:23.246513-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:23-05:00 | 33.930 | 33.93 | 33.925 | 33.930 | 2200.0 | 12.0 | 2024-02-16 10:24:24.061319-05:00 | 33.929545 | 400.0 | 200.0 |
| 2024-02-16 10:24:24-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 5054.0 | 10.0 | 2024-02-16 10:24:25.017519-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:25-05:00 | 33.930 | 33.93 | 33.930 | 33.930 | 3712.0 | 8.0 | 2024-02-16 10:24:26.092502-05:00 | 33.930000 | 0.0 | 0.0 |
| 2024-02-16 10:24:26-05:00 | 33.925 | 33.93 | 33.915 | 33.920 | 7457.0 | 33.0 | 2024-02-16 10:24:27.112140-05:00 | 33.920636 | 699.0 | 1461.0 |
2077 rows × 10 columns
In [8]:
focohlc.info()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[8], line 1 ----> 1 focohlc.info() NameError: name 'focohlc' is not defined
In [5]:
#trades_df.to_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet", engine='pyarrow', compression='gzip') #trades_df = pd.read_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet",engine='pyarrow') #trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
In [6]:
trades_df
Out[6]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | ||
|---|---|---|---|---|---|---|---|
| symbol | timestamp | ||||||
| BAC | 2024-01-02 09:30:00.329729-05:00 | T | 33.390 | 9684 | 62879149669684 | [ , Q] | A |
| 2024-01-02 09:30:00.758040-05:00 | P | 33.430 | 700 | 52983525495600 | [ ] | A | |
| 2024-01-02 09:30:00.758997-05:00 | P | 33.440 | 400 | 52983525495601 | [ ] | A | |
| 2024-01-02 09:30:01.086662-05:00 | N | 33.460 | 429483 | 52983525028681 | [ , Q] | A | |
| 2024-01-02 09:30:01.086708-05:00 | P | 33.450 | 100 | 52983525495863 | [ ] | A | |
| ... | ... | ... | ... | ... | ... | ... | |
| 2024-05-14 15:59:59.976990-04:00 | N | 38.490 | 42501 | 52983576482130 | [ ] | A | |
| 2024-05-14 15:59:59.977038-04:00 | N | 38.495 | 100 | 52983576482132 | [ ] | A | |
| 2024-05-14 15:59:59.977065-04:00 | N | 38.495 | 140 | 52983576482133 | [ ] | A | |
| 2024-05-14 15:59:59.977618-04:00 | T | 38.495 | 140 | 62883460167226 | [ ] | A | |
| 2024-05-14 15:59:59.978364-04:00 | D | 38.495 | 100 | 71714533561907 | [ ] | A |
6467196 rows × 6 columns
In [4]:
file_trades
Out[4]:
'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'
In [ ]:
#list all files is dir directory with parquet extension dir = DATA_DIR + "/notebooks/" import os files = [f for f in os.listdir(dir) if f.endswith(".parquet")] file_name = "" ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
In [ ]:
ohlcv_df
In [ ]:
import matplotlib.pyplot as plt import seaborn as sns # Calculate daily returns ohlcv_df['returns'] = ohlcv_df['close'].pct_change().dropna() #same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na # Plot the probability distribution curve plt.figure(figsize=(10, 6)) sns.histplot(df['returns'].dropna(), kde=True, stat='probability', bins=30) plt.title('Probability Distribution of Daily Returns') plt.xlabel('Daily Returns') plt.ylabel('Probability') plt.show()
In [ ]:
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score # Define the intervals from 5 to 20 s, returns for each interval #maybe use rolling window? intervals = range(5, 21, 5) # Create columns for percentage returns rolling_window = 50 # Normalize the returns using rolling mean and std for N in intervals: column_name = f'returns_{N}' rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean() rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std() ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std # Display the dataframe with normalized return columns ohlcv_df
In [ ]:
# Calculate the sum of the normalized return columns for each row ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1) # Sort the DataFrame based on the sum of normalized returns in descending order df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False) # Display the top rows with the highest sum of normalized returns df_sorted
In [ ]:
# Drop initial rows with NaN values due to pct_change ohlcv_df.dropna(inplace=True) # Plotting the probability distribution curves plt.figure(figsize=(14, 8)) for N in intervals: sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True) plt.title('Probability Distribution of Percentage Returns') plt.xlabel('Percentage Return') plt.ylabel('Density') plt.legend() plt.show()
In [ ]:
import matplotlib.pyplot as plt import seaborn as sns # Plot the probability distribution curve plt.figure(figsize=(10, 6)) sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30) plt.title('Probability Distribution of Daily Returns') plt.xlabel('Daily Returns') plt.ylabel('Probability') plt.show()
In [ ]:
#show only rows from ohlcv_df where returns > 0.005 ohlcv_df[ohlcv_df['returns'] > 0.0005] #ohlcv_df[ohlcv_df['returns'] < -0.005]
In [ ]:
#ohlcv where index = date 2024-03-13 and between hour 12 a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00'] a
In [ ]:
ohlcv_df
In [ ]:
trades_df
In [ ]:
ohlcv_df.info()
In [ ]:
trades_df.to_parquet("trades_df-spy-0111-0111.parquett", engine='pyarrow', compression='gzip')
In [ ]:
trades_df.to_parquet("trades_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)
In [ ]:
ohlcv_df.to_parquet("ohlcv_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip')
In [ ]:
basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY) vbt.settings['plotting']['auto_rangebreaks'] = True basic_data.ohlcv.plot()
In [ ]:
#access just BCA #df_filtered = df.loc["BAC"]