Files
v2realbot/research/vectorized_loader.ipynb
2024-04-25 06:24:51 +02:00

621 lines
22 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pyarrow\n",
"import numpy as np\n",
"from numba import jit\n",
"import v2realbot.utils.config_handler as cfh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Další info k pokračování je zde https://blog.quantinsti.com/tick-tick-ohlc-data-pandas-tutorial/"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 190261 non-null object \n",
" 1 price 190261 non-null float64\n",
" 2 size 190261 non-null float64\n",
" 3 id 190261 non-null int64 \n",
" 4 conditions 190261 non-null object \n",
" 5 tape 190261 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 10.2+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>exchange</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>id</th>\n",
" <th>conditions</th>\n",
" <th>tape</th>\n",
" </tr>\n",
" <tr>\n",
" <th>timestamp</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.267711+00:00</th>\n",
" <td>K</td>\n",
" <td>36.890</td>\n",
" <td>5.0</td>\n",
" <td>52983525037630</td>\n",
" <td>[ , F, I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.300501+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117014</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.305439+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117496</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.314520+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241118034</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.335201+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241121369</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.902614+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>1100.0</td>\n",
" <td>56480705310575</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977134+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>300.0</td>\n",
" <td>52983559963478</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977137+00:00</th>\n",
" <td>N</td>\n",
" <td>37.740</td>\n",
" <td>7300.0</td>\n",
" <td>52983559963696</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.978626+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>16.0</td>\n",
" <td>56480706886228</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.987614+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>30.0</td>\n",
" <td>52983559963958</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>190261 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" exchange price size id \\\n",
"timestamp \n",
"2024-04-22 13:30:00.267711+00:00 K 36.890 5.0 52983525037630 \n",
"2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 \n",
"2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 \n",
"2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 \n",
"2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 \n",
"... ... ... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 \n",
"2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 \n",
"2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 \n",
"2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 \n",
"2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 \n",
"\n",
" conditions tape \n",
"timestamp \n",
"2024-04-22 13:30:00.267711+00:00 [ , F, I] A \n",
"2024-04-22 13:30:00.300501+00:00 [ , I] A \n",
"2024-04-22 13:30:00.305439+00:00 [ , I] A \n",
"2024-04-22 13:30:00.314520+00:00 [ , I] A \n",
"2024-04-22 13:30:00.335201+00:00 [ , I] A \n",
"... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 [ ] A \n",
"2024-04-22 19:59:59.977134+00:00 [ ] A \n",
"2024-04-22 19:59:59.977137+00:00 [ ] A \n",
"2024-04-22 19:59:59.978626+00:00 [ , I] A \n",
"2024-04-22 19:59:59.987614+00:00 [ , I] A \n",
"\n",
"[190261 rows x 6 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tdf=pd.read_parquet('trades_bac.parquet',engine='pyarrow')\n",
"#print(df)\n",
"df = tdf.loc['BAC']\n",
"df.info()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"@jit(nopython=True)\n",
"def ohlcv_bars(ticks, start_time, end_time, resolution):\n",
" \"\"\"\n",
" Generate OHLCV bars from tick data, skipping intervals without trading activity.\n",
" \n",
" Parameters:\n",
" - ticks: numpy array with columns [timestamp, price, size]\n",
" - start_time: the start timestamp for bars (Unix timestamp)\n",
" - end_time: the end timestamp for bars (Unix timestamp)\n",
" - resolution: time resolution in seconds\n",
" \n",
" Returns:\n",
" - OHLCV bars as a numpy array\n",
" \"\"\"\n",
" num_bars = (end_time - start_time) // resolution + 1\n",
" bar_list = []\n",
"\n",
" for i in range(num_bars):\n",
" bar_start_time = start_time + i * resolution\n",
" bar_end_time = bar_start_time + resolution\n",
" bar_ticks = ticks[(ticks[:, 0] >= bar_start_time) & (ticks[:, 0] < bar_end_time)]\n",
" \n",
" if bar_ticks.shape[0] == 0:\n",
" continue # Skip this bar as there are no ticks\n",
"\n",
" # Calculate OHLCV values\n",
" open_price = bar_ticks[0, 1] # open\n",
" high_price = np.max(bar_ticks[:, 1]) # high\n",
" low_price = np.min(bar_ticks[:, 1]) # low\n",
" close_price = bar_ticks[-1, 1] # close\n",
" volume = np.sum(bar_ticks[:, 2]) # volume\n",
" bar_time = bar_start_time # timestamp for the bar\n",
"\n",
" bar_list.append([open_price, high_price, low_price, close_price, volume, bar_time])\n",
"\n",
" # Convert list to numpy array\n",
" if bar_list:\n",
" ohlcv = np.array(bar_list)\n",
" else:\n",
" ohlcv = np.empty((0, 6)) # return an empty array if no bars were created\n",
"\n",
" return ohlcv\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 190261 non-null object \n",
" 1 price 190261 non-null float64\n",
" 2 size 190261 non-null float64\n",
" 3 id 190261 non-null int64 \n",
" 4 conditions 190261 non-null object \n",
" 5 tape 190261 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 10.2+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 143751 entries, 2024-04-22 13:30:00.300501+00:00 to 2024-04-22 19:59:59.987614+00:00\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 exchange 143751 non-null object \n",
" 1 price 143751 non-null float64\n",
" 2 size 143751 non-null float64\n",
" 3 id 143751 non-null int64 \n",
" 4 conditions 143751 non-null object \n",
" 5 tape 143751 non-null object \n",
"dtypes: float64(2), int64(1), object(3)\n",
"memory usage: 7.7+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>exchange</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>id</th>\n",
" <th>conditions</th>\n",
" <th>tape</th>\n",
" </tr>\n",
" <tr>\n",
" <th>timestamp</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.300501+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117014</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.305439+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241117496</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.314520+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241118034</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.335201+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241121369</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 13:30:00.346219+00:00</th>\n",
" <td>D</td>\n",
" <td>37.005</td>\n",
" <td>1.0</td>\n",
" <td>71675241122389</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.902614+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>1100.0</td>\n",
" <td>56480705310575</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977134+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>300.0</td>\n",
" <td>52983559963478</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.977137+00:00</th>\n",
" <td>N</td>\n",
" <td>37.740</td>\n",
" <td>7300.0</td>\n",
" <td>52983559963696</td>\n",
" <td>[ ]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.978626+00:00</th>\n",
" <td>V</td>\n",
" <td>37.750</td>\n",
" <td>16.0</td>\n",
" <td>56480706886228</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-22 19:59:59.987614+00:00</th>\n",
" <td>N</td>\n",
" <td>37.745</td>\n",
" <td>30.0</td>\n",
" <td>52983559963958</td>\n",
" <td>[ , I]</td>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>143751 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" exchange price size id \\\n",
"timestamp \n",
"2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 \n",
"2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 \n",
"2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 \n",
"2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 \n",
"2024-04-22 13:30:00.346219+00:00 D 37.005 1.0 71675241122389 \n",
"... ... ... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 \n",
"2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 \n",
"2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 \n",
"2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 \n",
"2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 \n",
"\n",
" conditions tape \n",
"timestamp \n",
"2024-04-22 13:30:00.300501+00:00 [ , I] A \n",
"2024-04-22 13:30:00.305439+00:00 [ , I] A \n",
"2024-04-22 13:30:00.314520+00:00 [ , I] A \n",
"2024-04-22 13:30:00.335201+00:00 [ , I] A \n",
"2024-04-22 13:30:00.346219+00:00 [ , I] A \n",
"... ... ... \n",
"2024-04-22 19:59:59.902614+00:00 [ ] A \n",
"2024-04-22 19:59:59.977134+00:00 [ ] A \n",
"2024-04-22 19:59:59.977137+00:00 [ ] A \n",
"2024-04-22 19:59:59.978626+00:00 [ , I] A \n",
"2024-04-22 19:59:59.987614+00:00 [ , I] A \n",
"\n",
"[143751 rows x 6 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"excludes = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES')\n",
"print(excludes)\n",
"#excludes = [\"F\", \"I\"]\n",
"# FILTER EXCLUDED TRADES\n",
"# Filter rows to exclude those where 'conditions' contains 'F' or 'I'\n",
"# This simplifies the logic by directly using ~ (bitwise not operator) with np.isin\n",
"df = df[~df['conditions'].apply(lambda x: np.isin(x, excludes).any())]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/8p/dwqnp65s0s77jdbm4_6z4vp80000gn/T/ipykernel_52602/3341929382.py:2: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future\n",
" structured_array = np.array(list(zip(df.index, df['price'], df['size'])),\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00)\n",
" ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00)\n",
" ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00) ...\n",
" ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03)\n",
" ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01)\n",
" ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)]\n"
]
},
{
"data": {
"text/plain": [
"array([('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00),\n",
" ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00),\n",
" ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00), ...,\n",
" ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03),\n",
" ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01),\n",
" ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)],\n",
" dtype=[('timestamp', '<M8[ns]'), ('price', '<f8'), ('size', '<f8')])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creating a structured array with the timestamp as the first element\n",
"structured_array = np.array(list(zip(df.index, df['price'], df['size'])),\n",
" dtype=[('timestamp', 'datetime64[ns]'), ('price', 'float'), ('size', 'float')])\n",
"\n",
"print(structured_array)\n",
"structured_array\n",
"\n",
"# ticks = df[['index', 'price', 'size']].to_numpy()\n",
"# # ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000 # \n",
"# ticks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resolution_seconds = 1 # 1 second resolution\n",
"ohlcv_data = ohlcv_bars(structured_array, resolution_seconds)\n",
"\n",
"# Converting the result back to DataFrame for better usability\n",
"ohlcv_df = pd.DataFrame(ohlcv_data, columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Time'])\n",
"ohlcv_df['Time'] = pd.to_datetime(ohlcv_df['Time'], unit='s') # Convert timestamps back to datetime\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}