Files

David Brazda 702328a242 research added (#207 )

2024-06-13 11:02:41 +02:00

57 KiB

Raw Blame History

Loading trades and vectorized aggregation¶

This notebook fetches the trades from remote or local cache and aggregates them to bars of given type (time, volume, dollar) and resolution

fetch_trades_parallel enables to fetch trades of given symbol and interval, also can filter conditions and minimum size. return trades_df aggregate_trades acceptss trades_df and ressolution and type of bars (VOLUME, TIME, DOLLAR) and return aggregated ohlcv dataframe ohlcv_df

In [20]:

import pandas as pd
import numpy as np
from numba import jit
from alpaca.data.historical import StockHistoricalDataClient
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
from alpaca.data.requests import StockTradesRequest
from v2realbot.enums.enums import BarType
import time
from datetime import datetime
from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data
import pyarrow
from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades
import vectorbtpro as vbt
import v2realbot.utils.config_handler as cfh

vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# Set the option to display with pagination
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 20)  # Number of rows per page
# pd.set_option('display.float_format', '{:.9f}'.format)


#trade filtering
exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F']
minsize = 100

symbol = "BAC"
#datetime in zoneNY 
day_start = datetime(2023, 1, 1, 9, 30, 0)
day_stop = datetime(2024, 5, 25, 15, 30, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)
#filename of trades_df parquet, date are in isoformat but without time zone part
dir = DATA_DIR + "/notebooks/"
#parquet interval cache contains exclude conditions and minsize filtering
file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
#file_trades = dir + f"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet"
file_ohlcv = dir + f"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet"
print(file_trades)
print(file_ohlcv)
#PRINT all parquet in directory
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
for f in files:
    print(f)

/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet
trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet
trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet
ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet
ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet

In [18]:

#Either load trades or ohlcv from parquet if exists
#trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')
#filenames = [dir+"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet",dir+"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet"]
trades_df = pd.read_parquet(dir+"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet",engine='pyarrow')
#focused = trades_df.loc["2024-02-16 11:23:11":"2024-02-16 11:24:26"]
#focused
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME)
ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip')

#ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')
# trades_df = pd.read_parquet(file_trades,engine='pyarrow')

In [19]:

ohlcv_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00
Data columns (total 10 columns):
 #   Column      Dtype                     
---  ------      -----                     
 0   open        float64                   
 1   high        float64                   
 2   low         float64                   
 3   close       float64                   
 4   volume      float64                   
 5   trades      float64                   
 6   updated     datetime64[ns, US/Eastern]
 7   vwap        float64                   
 8   buyvolume   float64                   
 9   sellvolume  float64                   
dtypes: datetime64[ns, US/Eastern](1), float64(9)
memory usage: 381.8 MB

In [16]:

a = trades_df.loc[("BAC", "2024-02-16 09:30"):("BAC","2024-02-16 09:32:11")]
a

Out[16]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-02-16 09:30:00.708657-05:00	P	33.92	200	52983525230899	[ ]	A
	2024-02-16 09:30:00.708657-05:00	P	33.92	200	52983525230900	[ , Q]	A
	2024-02-16 09:30:00.708660-05:00	P	33.92	200	52983525230901	[ ]	A
	2024-02-16 09:30:00.708687-05:00	P	33.92	200	52983525230903	[ ]	A
	2024-02-16 09:30:00.708744-05:00	P	33.92	100	52983525230904	[ ]	A
	...	...	...	...	...	...	...
	2024-02-16 09:32:10.624595-05:00	P	33.86	400	52983525421904	[ ]	A
	2024-02-16 09:32:10.625001-05:00	K	33.86	400	52983525100399	[ ]	A
	2024-02-16 09:32:10.625151-05:00	T	33.86	100	62879199619393	[ ]	A
	2024-02-16 09:32:10.625157-05:00	T	33.86	150	62879199619395	[ ]	A
	2024-02-16 09:32:10.625157-05:00	T	33.86	400	62879199619396	[ ]	A

839 rows × 6 columns

In [14]:

ohlcv_df

Out[14]:

	open	high	low	close	volume	trades	updated	vwap	buyvolume	sellvolume
time
2024-02-15 15:58:00-05:00	34.0450	34.0499	34.0400	34.045	7032.0	19.0	2024-02-15 15:58:01.013622-05:00	34.045109	1976.0	1663.0
2024-02-15 15:58:01-05:00	34.0450	34.0500	34.0401	34.045	9216.0	39.0	2024-02-15 15:58:02.103157-05:00	34.048374	600.0	733.0
2024-02-15 15:58:02-05:00	34.0499	34.0500	34.0401	34.050	2404.0	20.0	2024-02-15 15:58:03.139942-05:00	34.048829	622.0	382.0
2024-02-15 15:58:03-05:00	34.0499	34.0500	34.0425	34.050	2481.0	11.0	2024-02-15 15:58:04.047101-05:00	34.046025	501.0	565.0
2024-02-15 15:58:04-05:00	34.0500	34.0599	34.0500	34.055	21532.0	57.0	2024-02-15 15:58:05.143236-05:00	34.051678	1465.0	1637.0
...	...	...	...	...	...	...	...	...	...	...
2024-02-16 09:34:52-05:00	33.7050	33.7050	33.7050	33.705	100.0	1.0	2024-02-16 09:34:53.669980-05:00	33.705000	0.0	100.0
2024-02-16 09:34:53-05:00	33.7050	33.7050	33.7050	33.705	300.0	1.0	2024-02-16 09:34:55.212184-05:00	33.705000	0.0	0.0
2024-02-16 09:34:55-05:00	33.7092	33.7092	33.7050	33.705	304.0	2.0	2024-02-16 09:34:56.304454-05:00	33.706382	100.0	204.0
2024-02-16 09:34:56-05:00	33.7050	33.7150	33.7050	33.715	900.0	9.0	2024-02-16 09:34:58.166409-05:00	33.711667	200.0	0.0
2024-02-16 09:34:58-05:00	33.7150	33.7150	33.7100	33.710	2461.0	14.0	2024-02-16 09:34:58.958611-05:00	33.710707	100.0	200.0

324 rows × 10 columns

In [13]:

#trades_df.info()
focused = trades_df.loc[("BAC", "2024-02-16 09:30:00"):("BAC", "2024-02-16 10:24:26")]
focused

Out[13]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-02-16 09:30:00.708657-05:00	P	33.920	200	52983525230899	[ ]	A
	2024-02-16 09:30:00.708657-05:00	P	33.920	200	52983525230900	[ , Q]	A
	2024-02-16 09:30:00.708660-05:00	P	33.920	200	52983525230901	[ ]	A
	2024-02-16 09:30:00.708687-05:00	P	33.920	200	52983525230903	[ ]	A
	2024-02-16 09:30:00.708744-05:00	P	33.920	100	52983525230904	[ ]	A
	...	...	...	...	...	...	...
	2024-02-16 10:24:25.203728-05:00	D	33.930	434	71679923009363	[ ]	A
	2024-02-16 10:24:25.204664-05:00	D	33.930	118	71679923009364	[ ]	A
	2024-02-16 10:24:25.269020-05:00	D	33.930	580	71679923009601	[ ]	A
	2024-02-16 10:24:25.296243-05:00	D	33.930	325	71679923009612	[ ]	A
	2024-02-16 10:24:26.092502-05:00	N	33.925	275	52983526543427	[ ]	A

12032 rows × 6 columns

In [10]:

trades_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]

Out[10]:

		exchange	price	size	id	conditions	tape
symbol	timestamp

In [9]:

focohlc = ohlcv_df.loc["2024-02-16 09:30:00":"2024-02-16 10:24:26"]
focohlc

Out[9]:

	open	high	low	close	volume	trades	updated	vwap	buyvolume	sellvolume
time
2024-02-16 09:30:00-05:00	33.920	33.92	33.900	33.900	1700.0	12.0	2024-02-16 09:30:01.023464-05:00	33.915882	0.0	400.0
2024-02-16 09:30:01-05:00	33.870	33.96	33.860	33.960	931216.0	110.0	2024-02-16 09:30:02.118683-05:00	33.949280	912802.0	4471.0
2024-02-16 09:30:02-05:00	33.960	33.98	33.945	33.945	1160.0	5.0	2024-02-16 09:30:03.440994-05:00	33.959655	180.0	300.0
2024-02-16 09:30:03-05:00	33.930	33.95	33.921	33.925	5376.0	13.0	2024-02-16 09:30:04.515116-05:00	33.934600	1548.0	991.0
2024-02-16 09:30:04-05:00	33.900	33.94	33.870	33.890	4759.0	13.0	2024-02-16 09:30:05.163964-05:00	33.874182	570.0	2889.0
...	...	...	...	...	...	...	...	...	...	...
2024-02-16 10:24:22-05:00	33.930	33.93	33.930	33.930	1626.0	8.0	2024-02-16 10:24:23.246513-05:00	33.930000	0.0	0.0
2024-02-16 10:24:23-05:00	33.930	33.93	33.925	33.930	2200.0	12.0	2024-02-16 10:24:24.061319-05:00	33.929545	400.0	200.0
2024-02-16 10:24:24-05:00	33.930	33.93	33.930	33.930	5054.0	10.0	2024-02-16 10:24:25.017519-05:00	33.930000	0.0	0.0
2024-02-16 10:24:25-05:00	33.930	33.93	33.930	33.930	3712.0	8.0	2024-02-16 10:24:26.092502-05:00	33.930000	0.0	0.0
2024-02-16 10:24:26-05:00	33.925	33.93	33.915	33.920	7457.0	33.0	2024-02-16 10:24:27.112140-05:00	33.920636	699.0	1461.0

2077 rows × 10 columns

In [8]:

focohlc.info()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 focohlc.info()

NameError: name 'focohlc' is not defined

In [5]:

#trades_df.to_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet", engine='pyarrow', compression='gzip')
#trades_df = pd.read_parquet(dir + "trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet",engine='pyarrow')

#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')

In [6]:

trades_df

Out[6]:

		exchange	price	size	id	conditions	tape
symbol	timestamp
BAC	2024-01-02 09:30:00.329729-05:00	T	33.390	9684	62879149669684	[ , Q]	A
	2024-01-02 09:30:00.758040-05:00	P	33.430	700	52983525495600	[ ]	A
	2024-01-02 09:30:00.758997-05:00	P	33.440	400	52983525495601	[ ]	A
	2024-01-02 09:30:01.086662-05:00	N	33.460	429483	52983525028681	[ , Q]	A
	2024-01-02 09:30:01.086708-05:00	P	33.450	100	52983525495863	[ ]	A
	...	...	...	...	...	...	...
	2024-05-14 15:59:59.976990-04:00	N	38.490	42501	52983576482130	[ ]	A
	2024-05-14 15:59:59.977038-04:00	N	38.495	100	52983576482132	[ ]	A
	2024-05-14 15:59:59.977065-04:00	N	38.495	140	52983576482133	[ ]	A
	2024-05-14 15:59:59.977618-04:00	T	38.495	140	62883460167226	[ ]	A
	2024-05-14 15:59:59.978364-04:00	D	38.495	100	71714533561907	[ ]	A

6467196 rows × 6 columns

In [4]:

file_trades

Out[4]:

'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'

In [ ]:

#list all files is dir directory with parquet extension
dir = DATA_DIR + "/notebooks/"
import os
files = [f for f in os.listdir(dir) if f.endswith(".parquet")]
file_name = ""
ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')

In [ ]:

ohlcv_df

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sns
# Calculate daily returns
ohlcv_df['returns'] = ohlcv_df['close'].pct_change().dropna()
#same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na


# Plot the probability distribution curve
plt.figure(figsize=(10, 6))
sns.histplot(df['returns'].dropna(), kde=True, stat='probability', bins=30)
plt.title('Probability Distribution of Daily Returns')
plt.xlabel('Daily Returns')
plt.ylabel('Probability')
plt.show()

In [ ]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the intervals from 5 to 20 s, returns for each interval
#maybe use rolling window?
intervals = range(5, 21, 5)

# Create columns for percentage returns
rolling_window = 50

# Normalize the returns using rolling mean and std
for N in intervals:
    column_name = f'returns_{N}'
    rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean()
    rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std()
    ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std

# Display the dataframe with normalized return columns
ohlcv_df

In [ ]:

# Calculate the sum of the normalized return columns for each row
ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1)

# Sort the DataFrame based on the sum of normalized returns in descending order
df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False)

# Display the top rows with the highest sum of normalized returns
df_sorted

In [ ]:

# Drop initial rows with NaN values due to pct_change
ohlcv_df.dropna(inplace=True)

# Plotting the probability distribution curves
plt.figure(figsize=(14, 8))
for N in intervals:
    sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True)

plt.title('Probability Distribution of Percentage Returns')
plt.xlabel('Percentage Return')
plt.ylabel('Density')
plt.legend()
plt.show()

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sns
# Plot the probability distribution curve
plt.figure(figsize=(10, 6))
sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)
plt.title('Probability Distribution of Daily Returns')
plt.xlabel('Daily Returns')
plt.ylabel('Probability')
plt.show()

In [ ]:

#show only rows from ohlcv_df where returns > 0.005
ohlcv_df[ohlcv_df['returns'] > 0.0005]

#ohlcv_df[ohlcv_df['returns'] < -0.005]

In [ ]:

#ohlcv where index = date 2024-03-13 and between hour 12

a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00']
a

In [ ]:

ohlcv_df

In [ ]:

trades_df

In [ ]:

ohlcv_df.info()

In [ ]:

trades_df.to_parquet("trades_df-spy-0111-0111.parquett", engine='pyarrow', compression='gzip')

In [ ]:

trades_df.to_parquet("trades_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)

In [ ]:

ohlcv_df.to_parquet("ohlcv_df-spy-111-0516.parquett", engine='pyarrow', compression='gzip')

In [ ]:

basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()

In [ ]:

#access just BCA
#df_filtered = df.loc["BAC"]

57 KiB Raw Blame History Unescape Escape

Loading trades and vectorized aggregation¶

57 KiB

Raw Blame History