vectorized aggregator, minor changes (#198)

This commit is contained in:
David Brazda
2024-05-17 14:09:42 +02:00
committed by GitHub
parent 031b2427b9
commit 63c2f7e748
17 changed files with 36066 additions and 41746 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,316 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading trades and vectorized aggregation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from numba import jit\n",
"from alpaca.data.historical import StockHistoricalDataClient\n",
"from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR\n",
"from alpaca.data.requests import StockTradesRequest\n",
"from v2realbot.enums.enums import BarType\n",
"import time\n",
"\n",
"from datetime import datetime\n",
"from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data\n",
"import pyarrow\n",
"from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades\n",
"import vectorbtpro as vbt\n",
"\n",
"vbt.settings.set_theme(\"dark\")\n",
"vbt.settings['plotting']['layout']['width'] = 1280\n",
"vbt.settings.plotting.auto_rangebreaks = True\n",
"# Set the option to display with pagination\n",
"pd.set_option('display.notebook_repr_html', True)\n",
"pd.set_option('display.max_rows', 10) # Number of rows per page"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"symbol = \"SPY\"\n",
"#datetime in zoneNY \n",
"day_start = datetime(2024, 5, 15, 9, 30, 0)\n",
"day_stop = datetime(2024, 5, 16, 16, 00, 0)\n",
"day_start = zoneNY.localize(day_start)\n",
"day_stop = zoneNY.localize(day_stop)\n",
"#neslo by zrychlit, kdyz se zobrazuje pomalu Searching cache - nejaky bottle neck?\n",
"df = fetch_trades_parallel(symbol, day_start, day_stop, minsize=50) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])\n",
"ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1, type=BarType.TIME)\n",
"#df.info()\n",
"ohlcv_df\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)\n",
"vbt.settings['plotting']['auto_rangebreaks'] = True\n",
"basic_data.ohlcv.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR\n",
"import gzip\n",
"\n",
"file_path = f\"{DATA_DIR}/tradecache/BAC-1709044200-1709067600.cache.gz\"\n",
"\n",
"with gzip.open(file_path, 'rb') as fp:\n",
" tradesResponse = pickle.load(fp)\n",
"\n",
"tradesResponse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert_dict_to_multiindex_df(tradesResponse):\n",
" # Create a DataFrame for each key and add the key as part of the MultiIndex\n",
" dfs = []\n",
" for key, values in tradesResponse.items():\n",
" df = pd.DataFrame(values)\n",
" # Rename columns\n",
" # Select and order columns explicitly\n",
" #print(df)\n",
" df = df[['t', 'x', 'p', 's', 'i', 'c','z']]\n",
" df.rename(columns={'t': 'timestamp', 'c': 'conditions', 'p': 'price', 's': 'size', 'x': 'exchange', 'z':'tape', 'i':'id'}, inplace=True)\n",
" df['symbol'] = key # Add ticker as a column\n",
" df['timestamp'] = pd.to_datetime(df['timestamp']) # Convert 't' from string to datetime before setting it as an index\n",
" df.set_index(['symbol', 'timestamp'], inplace=True) # Set the multi-level index using both 'ticker' and 't'\n",
" df = df.tz_convert(zoneNY, level='timestamp')\n",
" dfs.append(df)\n",
"\n",
" # Concatenate all DataFrames into a single DataFrame with MultiIndex\n",
" final_df = pd.concat(dfs)\n",
"\n",
" return final_df\n",
"\n",
"# Convert and print the DataFrame\n",
"df = convert_dict_to_multiindex_df(tradesResponse)\n",
"df\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ohlcv_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ohlcv_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1000, type=\"dollar\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ohlcv_df.index.strftime('%Y-%m-%d %H').unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#ohlcv_df.groupby(ohlcv_df.index.date).size()\n",
"ohlcv_df.head(100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#access just BCA\n",
"df_filtered = df.loc[\"BAC\"]\n",
"\n",
"df_filtered.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_filtered= df_filtered.reset_index()\n",
"ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()\n",
"ticks\n",
"timestamps = ticks[:, 0]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_filtered= df_filtered.reset_index()\n",
"ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()\n",
"\n",
"#timestamp to integer\n",
"# Extract the timestamps column (assuming it's the first column)\n",
"timestamps = ticks[:, 0]\n",
"\n",
"# Convert the timestamps to Unix timestamps in seconds with microsecond precision\n",
"unix_timestamps_s = np.array([ts.timestamp() for ts in timestamps], dtype='float64')\n",
"\n",
"# Replace the original timestamps in the NumPy array with the converted Unix timestamps\n",
"ticks[:, 0] = unix_timestamps_s\n",
"\n",
"#ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000 # Convert to Unix timestamp\n",
"ticks\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ticks = ticks.astype(np.float64)\n",
"ticks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"resolution = 1 # Example resolution of 60 seconds\n",
"ohlcv_bars = generate_time_bars_nb(ticks, resolution)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ohlcv_bars"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the resulting array back to a DataFrame\n",
"columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades']\n",
"ohlcv_df = pd.DataFrame(ohlcv_bars, columns=columns)\n",
"ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s')\n",
"ohlcv_df.set_index('time', inplace=True)\n",
"ohlcv_df.index = ohlcv_df.index.tz_localize('UTC').tz_convert(zoneNY)\n",
"#ohlcv_df = ohlcv_df.loc[\"2024-03-1 15:50:00\":\"2024-03-28 13:40:00\"]\n",
"#ohlcv_df.index.strftime('%Y-%m-%d %H').unique()\n",
"\n",
"ohlcv_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

26673
research/rsi_alpaca.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

421
research/test1sbars.ipynb Normal file
View File

@ -0,0 +1,421 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from v2realbot.tools.loadbatch import load_batch\n",
"from v2realbot.utils.utils import zoneNY\n",
"import pandas as pd\n",
"import numpy as np\n",
"import vectorbtpro as vbt\n",
"from itables import init_notebook_mode, show\n",
"\n",
"init_notebook_mode(all_interactive=True)\n",
"\n",
"vbt.settings.set_theme(\"dark\")\n",
"vbt.settings['plotting']['layout']['width'] = 1280\n",
"vbt.settings.plotting.auto_rangebreaks = True\n",
"# Set the option to display with pagination\n",
"pd.set_option('display.notebook_repr_html', True)\n",
"pd.set_option('display.max_rows', 10) # Number of rows per page\n",
"\n",
"res, df = load_batch(batch_id=\"0fb5043a\", #46 days 1.3 - 6.5.\n",
" space_resolution_evenly=False,\n",
" indicators_columns=[\"Rsi14\"],\n",
" main_session_only=True,\n",
" verbose = False)\n",
"if res < 0:\n",
" print(\"Error\" + str(res) + str(df))\n",
"df = df[\"bars\"]\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# filter dates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#filter na dny\n",
"# dates_of_interest = pd.to_datetime(['2024-04-22', '2024-04-23']).tz_localize('US/Eastern')\n",
"# filtered_df = df.loc[df.index.normalize().isin(dates_of_interest)]\n",
"\n",
"# df = filtered_df\n",
"# df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import plotly.io as pio\n",
"pio.renderers.default = 'notebook'\n",
"\n",
"#naloadujeme do vbt symbol as column\n",
"basic_data = vbt.Data.from_data({\"BAC\": df}, tz_convert=zoneNY)\n",
"start_date = pd.Timestamp('2024-03-12 09:30', tz=zoneNY)\n",
"end_date = pd.Timestamp('2024-03-13 16:00', tz=zoneNY)\n",
"\n",
"#basic_data = basic_data.transform(lambda df: df[df.index.date == start_date.date()])\n",
"#basic_data = basic_data.transform(lambda df: df[(df.index >= start_date) & (df.index <= end_date)])\n",
"#basic_data.data[\"BAC\"].info()\n",
"\n",
"# fig = basic_data.plot(plot_volume=False)\n",
"# pivot_info = basic_data.run(\"pivotinfo\", up_th=0.003, down_th=0.002)\n",
"# #pivot_info.plot()\n",
"# pivot_info.plot(fig=fig, conf_value_trace_kwargs=dict(visible=True))\n",
"# fig.show()\n",
"\n",
"\n",
"# rsi14 = basic_data.data[\"BAC\"][\"Rsi14\"].rename(\"Rsi14\")\n",
"\n",
"# rsi14.vbt.plot().show()\n",
"#basic_data.xloc[\"09:30\":\"10:00\"].data[\"BAC\"].vbt.ohlcv.plot().show()\n",
"\n",
"vbt.settings.plotting.auto_rangebreaks = True\n",
"#basic_data.data[\"BAC\"].vbt.ohlcv.plot()\n",
"\n",
"#basic_data.data[\"BAC\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"m1_data = basic_data[['Open', 'High', 'Low', 'Close', 'Volume']]\n",
"\n",
"m1_data.data[\"BAC\"]\n",
"#m5_data = m1_data.resample(\"5T\")\n",
"\n",
"#m5_data.data[\"BAC\"].head(10)\n",
"\n",
"# m15_data = m1_data.resample(\"15T\")\n",
"\n",
"# m15 = m15_data.data[\"BAC\"]\n",
"\n",
"# m15.vbt.ohlcv.plot()\n",
"\n",
"# m1_data.wrapper.index\n",
"\n",
"# m1_resampler = m1_data.wrapper.get_resampler(\"1T\")\n",
"# m1_resampler.index_difference(reverse=True)\n",
"\n",
"\n",
"# m5_resampler.prettify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# defining ENTRY WINDOW and forced EXIT window"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#m1_data.data[\"BAC\"].info()\n",
"import datetime\n",
"# Define the market open and close times\n",
"market_open = datetime.time(9, 30)\n",
"market_close = datetime.time(16, 0)\n",
"entry_window_opens = 1\n",
"entry_window_closes = 350\n",
"\n",
"forced_exit_start = 380\n",
"forced_exit_end = 390\n",
"\n",
"forced_exit = m1_data.symbol_wrapper.fill(False)\n",
"entry_window_open= m1_data.symbol_wrapper.fill(False)\n",
"\n",
"# Calculate the time difference in minutes from market open for each timestamp\n",
"elapsed_min_from_open = (forced_exit.index.hour - market_open.hour) * 60 + (forced_exit.index.minute - market_open.minute)\n",
"\n",
"entry_window_open[(elapsed_min_from_open >= entry_window_opens) & (elapsed_min_from_open < entry_window_closes)] = True\n",
"forced_exit[(elapsed_min_from_open >= forced_exit_start) & (elapsed_min_from_open < forced_exit_end)] = True\n",
"\n",
"#entry_window_open.info()\n",
"# forced_exit.tail(100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"close = m1_data.close\n",
"\n",
"rsi = vbt.RSI.run(close, window=14)\n",
"\n",
"long_entries = (rsi.rsi.vbt.crossed_below(20) & entry_window_open)\n",
"long_exits = (rsi.rsi.vbt.crossed_above(70) | forced_exit)\n",
"#long_entries.info()\n",
"#number of trues and falses in long_entries\n",
"long_entries.value_counts()\n",
"#long_exits.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_rsi(rsi, close, entries, exits):\n",
" fig = vbt.make_subplots(rows=1, cols=1, shared_xaxes=True, specs=[[{\"secondary_y\": True}]], vertical_spacing=0.02, subplot_titles=(\"RSI\", \"Price\" ))\n",
" close.vbt.plot(fig=fig, add_trace_kwargs=dict(secondary_y=True))\n",
" rsi.plot(fig=fig, add_trace_kwargs=dict(secondary_y=False))\n",
" entries.vbt.signals.plot_as_entries(rsi.rsi, fig=fig, add_trace_kwargs=dict(secondary_y=False)) \n",
" exits.vbt.signals.plot_as_exits(rsi.rsi, fig=fig, add_trace_kwargs=dict(secondary_y=False)) \n",
" return fig\n",
"\n",
"plot_rsi(rsi, close, long_entries, long_exits)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vbt.phelp(vbt.Portfolio.from_signals)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sl_stop = np.arange(0.03/100, 0.2/100, 0.02/100).tolist()\n",
"# Using the round function\n",
"sl_stop = [round(val, 4) for val in sl_stop]\n",
"print(sl_stop)\n",
"sl_stop = vbt.Param(sl_stop) #np.nan mean s no stoploss\n",
"\n",
"pf = vbt.Portfolio.from_signals(close=close, entries=long_entries, sl_stop=sl_stop, tp_stop = sl_stop, exits=long_exits,fees=0.0167/100, freq=\"1s\") #sl_stop=sl_stop, tp_stop = sl_stop, \n",
"\n",
"#pf.stats()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf[(0.0015,0.0013)].plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf[0.03].plot_trade_signals()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pristup k pf jako multi index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#pf[0.03].plot()\n",
"#pf.order_records\n",
"pf[(0.03)].stats()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#zgrupovane statistiky\n",
"stats_df = pf.stats([\n",
" 'total_return',\n",
" 'total_trades',\n",
" 'win_rate',\n",
" 'expectancy'\n",
"], agg_func=None)\n",
"stats_df\n",
"\n",
"\n",
"stats_df.nlargest(50, 'Total Return [%]')\n",
"#stats_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf[(0.0011,0.0013)].plot()\n",
"\n",
"#pf[(0.0011,0.0013000000000000002)].plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pandas.tseries.offsets import DateOffset\n",
"\n",
"temp_data = basic_data['2024-4-22']\n",
"temp_data\n",
"res1m = temp_data[[\"Open\", \"High\", \"Low\", \"Close\", \"Volume\"]]\n",
"\n",
"# Define a custom date offset that starts at 9:30 AM and spans 4 hours\n",
"custom_offset = DateOffset(hours=4, minutes=30)\n",
"\n",
"# res1m = res1m.get().resample(\"4H\").agg({ \n",
"# \"Open\": \"first\",\n",
"# \"High\": \"max\",\n",
"# \"Low\": \"min\",\n",
"# \"Close\": \"last\",\n",
"# \"Volume\": \"sum\"\n",
"# })\n",
"\n",
"res4h = res1m.resample(\"1h\", resample_kwargs=dict(origin=\"start\"))\n",
"\n",
"res4h.data\n",
"\n",
"res15m = res1m.resample(\"15T\", resample_kwargs=dict(origin=\"start\"))\n",
"\n",
"res15m.data[\"BAC\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@vbt.njit\n",
"def long_entry_place_func_nb(c, low, close, time_in_ns, rsi14, window_open, window_close):\n",
" market_open_minutes = 570 # 9 hours * 60 minutes + 30 minutes\n",
"\n",
" for out_i in range(len(c.out)):\n",
" i = c.from_i + out_i\n",
"\n",
" current_minutes = vbt.dt_nb.hour_nb(time_in_ns[i]) * 60 + vbt.dt_nb.minute_nb(time_in_ns[i])\n",
" #print(\"current_minutes\", current_minutes)\n",
" # Calculate elapsed minutes since market open at 9:30 AM\n",
" elapsed_from_open = current_minutes - market_open_minutes\n",
" elapsed_from_open = elapsed_from_open if elapsed_from_open >= 0 else 0\n",
" #print( \"elapsed_from_open\", elapsed_from_open)\n",
"\n",
" #elapsed_from_open = elapsed_minutes_from_open_nb(time_in_ns) \n",
" in_window = elapsed_from_open > window_open and elapsed_from_open < window_close\n",
" #print(\"in_window\", in_window)\n",
" # if in_window:\n",
" # print(\"in window\")\n",
"\n",
" if in_window and rsi14[i] > 60: # and low[i, c.col] <= hit_price: # and hour == 9: # (4)!\n",
" return out_i\n",
" return -1\n",
"\n",
"@vbt.njit\n",
"def long_exit_place_func_nb(c, high, close, time_index, tp, sl): # (5)!\n",
" entry_i = c.from_i - c.wait\n",
" entry_price = close[entry_i, c.col]\n",
" hit_price = entry_price * (1 + tp)\n",
" stop_price = entry_price * (1 - sl)\n",
" for out_i in range(len(c.out)):\n",
" i = c.from_i + out_i\n",
" last_bar_of_day = vbt.dt_nb.day_changed_nb(time_index[i], time_index[i + 1])\n",
"\n",
" #print(next_day)\n",
" if last_bar_of_day: #pokud je dalsi next day, tak zavirame posledni\n",
" print(\"ted\",out_i)\n",
" return out_i\n",
" if close[i, c.col] >= hit_price or close[i, c.col] <= stop_price :\n",
" return out_i\n",
" return -1\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list('abcdefghij'))\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@ -1,620 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pyarrow\n",
"import numpy as np\n",
"from numba import jit\n",
"import v2realbot.utils.config_handler as cfh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Další info k pokračování je zde https://blog.quantinsti.com/tick-tick-ohlc-data-pandas-tutorial/"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 190261 non-null object \n",
" 1 price 190261 non-null float64\n",
" 2 size 190261 non-null float64\n",
" 3 id 190261 non-null int64 \n",
" 4 conditions 190261 non-null object \n",
" 5 tape 190261 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 10.2+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>exchange</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>id</th>\n",
" <th>conditions</th>\n",
" <th>tape</th>\n",
" </tr>\n",
" <tr>\n",
" <th>timestamp</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.267711+00:00</th>\n",
" <td>K</td>\n",
" <td>36.890</td>\n",
" <td>5.0</td>\n",
" <td>52983525037630</td>\n",
" <td>[ , F, I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.300501+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117014</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.305439+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117496</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.314520+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241118034</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.335201+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241121369</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.902614+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>1100.0</td>\n",
" <td>56480705310575</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977134+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>300.0</td>\n",
" <td>52983559963478</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977137+00:00</th>\n",
" <td>N</td>\n",
" <td>37.740</td>\n",
" <td>7300.0</td>\n",
" <td>52983559963696</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.978626+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>16.0</td>\n",
" <td>56480706886228</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.987614+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>30.0</td>\n",
" <td>52983559963958</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>190261 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" exchange price size id \\\n",
"timestamp \n",
"2024-04-22 13:30:00.267711+00:00 K 36.890 5.0 52983525037630 \n",
"2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 \n",
"2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 \n",
"2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 \n",
"2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 \n",
"... ... ... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 \n",
"2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 \n",
"2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 \n",
"2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 \n",
"2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 \n",
"\n",
" conditions tape \n",
"timestamp \n",
"2024-04-22 13:30:00.267711+00:00 [ , F, I] A \n",
"2024-04-22 13:30:00.300501+00:00 [ , I] A \n",
"2024-04-22 13:30:00.305439+00:00 [ , I] A \n",
"2024-04-22 13:30:00.314520+00:00 [ , I] A \n",
"2024-04-22 13:30:00.335201+00:00 [ , I] A \n",
"... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 [ ] A \n",
"2024-04-22 19:59:59.977134+00:00 [ ] A \n",
"2024-04-22 19:59:59.977137+00:00 [ ] A \n",
"2024-04-22 19:59:59.978626+00:00 [ , I] A \n",
"2024-04-22 19:59:59.987614+00:00 [ , I] A \n",
"\n",
"[190261 rows x 6 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tdf=pd.read_parquet('trades_bac.parquet',engine='pyarrow')\n",
"#print(df)\n",
"df = tdf.loc['BAC']\n",
"df.info()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"@jit(nopython=True)\n",
"def ohlcv_bars(ticks, start_time, end_time, resolution):\n",
" \"\"\"\n",
" Generate OHLCV bars from tick data, skipping intervals without trading activity.\n",
" \n",
" Parameters:\n",
" - ticks: numpy array with columns [timestamp, price, size]\n",
" - start_time: the start timestamp for bars (Unix timestamp)\n",
" - end_time: the end timestamp for bars (Unix timestamp)\n",
" - resolution: time resolution in seconds\n",
" \n",
" Returns:\n",
" - OHLCV bars as a numpy array\n",
" \"\"\"\n",
" num_bars = (end_time - start_time) // resolution + 1\n",
" bar_list = []\n",
"\n",
" for i in range(num_bars):\n",
" bar_start_time = start_time + i * resolution\n",
" bar_end_time = bar_start_time + resolution\n",
" bar_ticks = ticks[(ticks[:, 0] >= bar_start_time) & (ticks[:, 0] < bar_end_time)]\n",
" \n",
" if bar_ticks.shape[0] == 0:\n",
" continue # Skip this bar as there are no ticks\n",
"\n",
" # Calculate OHLCV values\n",
" open_price = bar_ticks[0, 1] # open\n",
" high_price = np.max(bar_ticks[:, 1]) # high\n",
" low_price = np.min(bar_ticks[:, 1]) # low\n",
" close_price = bar_ticks[-1, 1] # close\n",
" volume = np.sum(bar_ticks[:, 2]) # volume\n",
" bar_time = bar_start_time # timestamp for the bar\n",
"\n",
" bar_list.append([open_price, high_price, low_price, close_price, volume, bar_time])\n",
"\n",
" # Convert list to numpy array\n",
" if bar_list:\n",
" ohlcv = np.array(bar_list)\n",
" else:\n",
" ohlcv = np.empty((0, 6)) # return an empty array if no bars were created\n",
"\n",
" return ohlcv\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 190261 non-null object \n",
" 1 price 190261 non-null float64\n",
" 2 size 190261 non-null float64\n",
" 3 id 190261 non-null int64 \n",
" 4 conditions 190261 non-null object \n",
" 5 tape 190261 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 10.2+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 143751 entries, 2024-04-22 13:30:00.300501+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 143751 non-null object \n",
" 1 price 143751 non-null float64\n",
" 2 size 143751 non-null float64\n",
" 3 id 143751 non-null int64 \n",
" 4 conditions 143751 non-null object \n",
" 5 tape 143751 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 7.7+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>exchange</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>id</th>\n",
" <th>conditions</th>\n",
" <th>tape</th>\n",
" </tr>\n",
" <tr>\n",
" <th>timestamp</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.300501+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117014</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.305439+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117496</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.314520+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241118034</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.335201+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241121369</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.346219+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241122389</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.902614+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>1100.0</td>\n",
" <td>56480705310575</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977134+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>300.0</td>\n",
" <td>52983559963478</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977137+00:00</th>\n",
" <td>N</td>\n",
" <td>37.740</td>\n",
" <td>7300.0</td>\n",
" <td>52983559963696</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.978626+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>16.0</td>\n",
" <td>56480706886228</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.987614+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>30.0</td>\n",
" <td>52983559963958</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>143751 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" exchange price size id \\\n",
"timestamp \n",
"2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 \n",
"2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 \n",
"2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 \n",
"2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 \n",
"2024-04-22 13:30:00.346219+00:00 D 37.005 1.0 71675241122389 \n",
"... ... ... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 \n",
"2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 \n",
"2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 \n",
"2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 \n",
"2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 \n",
"\n",
" conditions tape \n",
"timestamp \n",
"2024-04-22 13:30:00.300501+00:00 [ , I] A \n",
"2024-04-22 13:30:00.305439+00:00 [ , I] A \n",
"2024-04-22 13:30:00.314520+00:00 [ , I] A \n",
"2024-04-22 13:30:00.335201+00:00 [ , I] A \n",
"2024-04-22 13:30:00.346219+00:00 [ , I] A \n",
"... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 [ ] A \n",
"2024-04-22 19:59:59.977134+00:00 [ ] A \n",
"2024-04-22 19:59:59.977137+00:00 [ ] A \n",
"2024-04-22 19:59:59.978626+00:00 [ , I] A \n",
"2024-04-22 19:59:59.987614+00:00 [ , I] A \n",
"\n",
"[143751 rows x 6 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"excludes = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES')\n",
"print(excludes)\n",
"#excludes = [\"F\", \"I\"]\n",
"# FILTER EXCLUDED TRADES\n",
"# Filter rows to exclude those where 'conditions' contains 'F' or 'I'\n",
"# This simplifies the logic by directly using ~ (bitwise not operator) with np.isin\n",
"df = df[~df['conditions'].apply(lambda x: np.isin(x, excludes).any())]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/8p/dwqnp65s0s77jdbm4_6z4vp80000gn/T/ipykernel_52602/3341929382.py:2: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future\n",
" structured_array = np.array(list(zip(df.index, df['price'], df['size'])),\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00)\n",
" ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00)\n",
" ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00) ...\n",
" ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03)\n",
" ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01)\n",
" ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)]\n"
]
},
{
"data": {
"text/plain": [
"array([('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00),\n",
" ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00),\n",
" ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00), ...,\n",
" ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03),\n",
" ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01),\n",
" ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)],\n",
" dtype=[('timestamp', '<M8[ns]'), ('price', '<f8'), ('size', '<f8')])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creating a structured array with the timestamp as the first element\n",
"structured_array = np.array(list(zip(df.index, df['price'], df['size'])),\n",
" dtype=[('timestamp', 'datetime64[ns]'), ('price', 'float'), ('size', 'float')])\n",
"\n",
"print(structured_array)\n",
"structured_array\n",
"\n",
"# ticks = df[['index', 'price', 'size']].to_numpy()\n",
"# # ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000 # \n",
"# ticks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resolution_seconds = 1 # 1 second resolution\n",
"ohlcv_data = ohlcv_bars(structured_array, resolution_seconds)\n",
"\n",
"# Converting the result back to DataFrame for better usability\n",
"ohlcv_df = pd.DataFrame(ohlcv_data, columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Time'])\n",
"ohlcv_df['Time'] = pd.to_datetime(ohlcv_df['Time'], unit='s') # Convert timestamps back to datetime\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}