{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading trades and vectorized aggregation\n", "This notebook fetches the trades from remote or local cache and aggregates them to bars of given type (time, volume, dollar) and resolution\n", "\n", "`fetch_trades_parallel` enables to fetch trades of given symbol and interval, also can filter conditions and minimum size. return `trades_df`\n", "`aggregate_trades` acceptss `trades_df` and ressolution and type of bars (VOLUME, TIME, DOLLAR) and return aggregated ohlcv dataframe `ohlcv_df`" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet\n", "/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet\n", "trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet\n", "trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet\n", "trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet\n", "trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n", "ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet\n", "trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n", "ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n", "ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet\n", "ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet\n", "ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n", "ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from numba import jit\n", "from alpaca.data.historical import StockHistoricalDataClient\n", "from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR\n", "from alpaca.data.requests import StockTradesRequest\n", "from v2realbot.enums.enums import BarType\n", "import time\n", "from datetime import datetime\n", "from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data\n", "import pyarrow\n", "from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades\n", "import vectorbtpro as vbt\n", "import v2realbot.utils.config_handler as cfh\n", "\n", "vbt.settings.set_theme(\"dark\")\n", "vbt.settings['plotting']['layout']['width'] = 1280\n", "vbt.settings.plotting.auto_rangebreaks = True\n", "# Set the option to display with pagination\n", "pd.set_option('display.notebook_repr_html', True)\n", "pd.set_option('display.max_rows', 20) # Number of rows per page\n", "# pd.set_option('display.float_format', '{:.9f}'.format)\n", "\n", "\n", "#trade filtering\n", "exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F']\n", "minsize = 100\n", "\n", "symbol = \"BAC\"\n", "#datetime in zoneNY \n", "day_start = datetime(2023, 1, 1, 9, 30, 0)\n", "day_stop = datetime(2024, 5, 25, 15, 30, 0)\n", "day_start = zoneNY.localize(day_start)\n", "day_stop = zoneNY.localize(day_stop)\n", "#filename of trades_df parquet, date are in isoformat but without time zone part\n", "dir = DATA_DIR + \"/notebooks/\"\n", "#parquet interval cache contains exclude conditions and minsize filtering\n", "file_trades = dir + f\"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet\"\n", "#file_trades = dir + f\"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet\"\n", "file_ohlcv = dir + f\"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H_%M_%S')}-{day_stop.strftime('%Y-%m-%dT%H_%M_%S')}-{''.join(exclude_conditions)}-{minsize}.parquet\"\n", "print(file_trades)\n", "print(file_ohlcv)\n", "#PRINT all parquet in directory\n", "import os\n", "files = [f for f in os.listdir(dir) if f.endswith(\".parquet\")]\n", "for f in files:\n", " print(f)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "#Either load trades or ohlcv from parquet if exists\n", "#trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, max_workers=30) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])\n", "#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')\n", "#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')\n", "#filenames = [dir+\"trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet\",dir+\"trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\"]\n", "trades_df = pd.read_parquet(dir+\"trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\",engine='pyarrow')\n", "#focused = trades_df.loc[\"2024-02-16 11:23:11\":\"2024-02-16 11:24:26\"]\n", "#focused\n", "ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME)\n", "ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip')\n", "\n", "#ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')\n", "# trades_df = pd.read_parquet(file_trades,engine='pyarrow')\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "DatetimeIndex: 4549772 entries, 2023-01-03 09:30:01-05:00 to 2024-05-24 15:59:59-04:00\n", "Data columns (total 10 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 open float64 \n", " 1 high float64 \n", " 2 low float64 \n", " 3 close float64 \n", " 4 volume float64 \n", " 5 trades float64 \n", " 6 updated datetime64[ns, US/Eastern]\n", " 7 vwap float64 \n", " 8 buyvolume float64 \n", " 9 sellvolume float64 \n", "dtypes: datetime64[ns, US/Eastern](1), float64(9)\n", "memory usage: 381.8 MB\n" ] } ], "source": [ "ohlcv_df.info()\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
exchangepricesizeidconditionstape
symboltimestamp
BAC2024-02-16 09:30:00.708657-05:00P33.9220052983525230899[ ]A
2024-02-16 09:30:00.708657-05:00P33.9220052983525230900[ , Q]A
2024-02-16 09:30:00.708660-05:00P33.9220052983525230901[ ]A
2024-02-16 09:30:00.708687-05:00P33.9220052983525230903[ ]A
2024-02-16 09:30:00.708744-05:00P33.9210052983525230904[ ]A
.....................
2024-02-16 09:32:10.624595-05:00P33.8640052983525421904[ ]A
2024-02-16 09:32:10.625001-05:00K33.8640052983525100399[ ]A
2024-02-16 09:32:10.625151-05:00T33.8610062879199619393[ ]A
2024-02-16 09:32:10.625157-05:00T33.8615062879199619395[ ]A
2024-02-16 09:32:10.625157-05:00T33.8640062879199619396[ ]A
\n", "

839 rows × 6 columns

\n", "
" ], "text/plain": [ " exchange price size id \\\n", "symbol timestamp \n", "BAC 2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230899 \n", " 2024-02-16 09:30:00.708657-05:00 P 33.92 200 52983525230900 \n", " 2024-02-16 09:30:00.708660-05:00 P 33.92 200 52983525230901 \n", " 2024-02-16 09:30:00.708687-05:00 P 33.92 200 52983525230903 \n", " 2024-02-16 09:30:00.708744-05:00 P 33.92 100 52983525230904 \n", "... ... ... ... ... \n", " 2024-02-16 09:32:10.624595-05:00 P 33.86 400 52983525421904 \n", " 2024-02-16 09:32:10.625001-05:00 K 33.86 400 52983525100399 \n", " 2024-02-16 09:32:10.625151-05:00 T 33.86 100 62879199619393 \n", " 2024-02-16 09:32:10.625157-05:00 T 33.86 150 62879199619395 \n", " 2024-02-16 09:32:10.625157-05:00 T 33.86 400 62879199619396 \n", "\n", " conditions tape \n", "symbol timestamp \n", "BAC 2024-02-16 09:30:00.708657-05:00 [ ] A \n", " 2024-02-16 09:30:00.708657-05:00 [ , Q] A \n", " 2024-02-16 09:30:00.708660-05:00 [ ] A \n", " 2024-02-16 09:30:00.708687-05:00 [ ] A \n", " 2024-02-16 09:30:00.708744-05:00 [ ] A \n", "... ... ... \n", " 2024-02-16 09:32:10.624595-05:00 [ ] A \n", " 2024-02-16 09:32:10.625001-05:00 [ ] A \n", " 2024-02-16 09:32:10.625151-05:00 [ ] A \n", " 2024-02-16 09:32:10.625157-05:00 [ ] A \n", " 2024-02-16 09:32:10.625157-05:00 [ ] A \n", "\n", "[839 rows x 6 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = trades_df.loc[(\"BAC\", \"2024-02-16 09:30\"):(\"BAC\",\"2024-02-16 09:32:11\")]\n", "a" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
openhighlowclosevolumetradesupdatedvwapbuyvolumesellvolume
time
2024-02-15 15:58:00-05:0034.045034.049934.040034.0457032.019.02024-02-15 15:58:01.013622-05:0034.0451091976.01663.0
2024-02-15 15:58:01-05:0034.045034.050034.040134.0459216.039.02024-02-15 15:58:02.103157-05:0034.048374600.0733.0
2024-02-15 15:58:02-05:0034.049934.050034.040134.0502404.020.02024-02-15 15:58:03.139942-05:0034.048829622.0382.0
2024-02-15 15:58:03-05:0034.049934.050034.042534.0502481.011.02024-02-15 15:58:04.047101-05:0034.046025501.0565.0
2024-02-15 15:58:04-05:0034.050034.059934.050034.05521532.057.02024-02-15 15:58:05.143236-05:0034.0516781465.01637.0
.................................
2024-02-16 09:34:52-05:0033.705033.705033.705033.705100.01.02024-02-16 09:34:53.669980-05:0033.7050000.0100.0
2024-02-16 09:34:53-05:0033.705033.705033.705033.705300.01.02024-02-16 09:34:55.212184-05:0033.7050000.00.0
2024-02-16 09:34:55-05:0033.709233.709233.705033.705304.02.02024-02-16 09:34:56.304454-05:0033.706382100.0204.0
2024-02-16 09:34:56-05:0033.705033.715033.705033.715900.09.02024-02-16 09:34:58.166409-05:0033.711667200.00.0
2024-02-16 09:34:58-05:0033.715033.715033.710033.7102461.014.02024-02-16 09:34:58.958611-05:0033.710707100.0200.0
\n", "

324 rows × 10 columns

\n", "
" ], "text/plain": [ " open high low close volume trades \\\n", "time \n", "2024-02-15 15:58:00-05:00 34.0450 34.0499 34.0400 34.045 7032.0 19.0 \n", "2024-02-15 15:58:01-05:00 34.0450 34.0500 34.0401 34.045 9216.0 39.0 \n", "2024-02-15 15:58:02-05:00 34.0499 34.0500 34.0401 34.050 2404.0 20.0 \n", "2024-02-15 15:58:03-05:00 34.0499 34.0500 34.0425 34.050 2481.0 11.0 \n", "2024-02-15 15:58:04-05:00 34.0500 34.0599 34.0500 34.055 21532.0 57.0 \n", "... ... ... ... ... ... ... \n", "2024-02-16 09:34:52-05:00 33.7050 33.7050 33.7050 33.705 100.0 1.0 \n", "2024-02-16 09:34:53-05:00 33.7050 33.7050 33.7050 33.705 300.0 1.0 \n", "2024-02-16 09:34:55-05:00 33.7092 33.7092 33.7050 33.705 304.0 2.0 \n", "2024-02-16 09:34:56-05:00 33.7050 33.7150 33.7050 33.715 900.0 9.0 \n", "2024-02-16 09:34:58-05:00 33.7150 33.7150 33.7100 33.710 2461.0 14.0 \n", "\n", " updated vwap \\\n", "time \n", "2024-02-15 15:58:00-05:00 2024-02-15 15:58:01.013622-05:00 34.045109 \n", "2024-02-15 15:58:01-05:00 2024-02-15 15:58:02.103157-05:00 34.048374 \n", "2024-02-15 15:58:02-05:00 2024-02-15 15:58:03.139942-05:00 34.048829 \n", "2024-02-15 15:58:03-05:00 2024-02-15 15:58:04.047101-05:00 34.046025 \n", "2024-02-15 15:58:04-05:00 2024-02-15 15:58:05.143236-05:00 34.051678 \n", "... ... ... \n", "2024-02-16 09:34:52-05:00 2024-02-16 09:34:53.669980-05:00 33.705000 \n", "2024-02-16 09:34:53-05:00 2024-02-16 09:34:55.212184-05:00 33.705000 \n", "2024-02-16 09:34:55-05:00 2024-02-16 09:34:56.304454-05:00 33.706382 \n", "2024-02-16 09:34:56-05:00 2024-02-16 09:34:58.166409-05:00 33.711667 \n", "2024-02-16 09:34:58-05:00 2024-02-16 09:34:58.958611-05:00 33.710707 \n", "\n", " buyvolume sellvolume \n", "time \n", "2024-02-15 15:58:00-05:00 1976.0 1663.0 \n", "2024-02-15 15:58:01-05:00 600.0 733.0 \n", "2024-02-15 15:58:02-05:00 622.0 382.0 \n", "2024-02-15 15:58:03-05:00 501.0 565.0 \n", "2024-02-15 15:58:04-05:00 1465.0 1637.0 \n", "... ... ... \n", "2024-02-16 09:34:52-05:00 0.0 100.0 \n", "2024-02-16 09:34:53-05:00 0.0 0.0 \n", "2024-02-16 09:34:55-05:00 100.0 204.0 \n", "2024-02-16 09:34:56-05:00 200.0 0.0 \n", "2024-02-16 09:34:58-05:00 100.0 200.0 \n", "\n", "[324 rows x 10 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohlcv_df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
exchangepricesizeidconditionstape
symboltimestamp
BAC2024-02-16 09:30:00.708657-05:00P33.92020052983525230899[ ]A
2024-02-16 09:30:00.708657-05:00P33.92020052983525230900[ , Q]A
2024-02-16 09:30:00.708660-05:00P33.92020052983525230901[ ]A
2024-02-16 09:30:00.708687-05:00P33.92020052983525230903[ ]A
2024-02-16 09:30:00.708744-05:00P33.92010052983525230904[ ]A
.....................
2024-02-16 10:24:25.203728-05:00D33.93043471679923009363[ ]A
2024-02-16 10:24:25.204664-05:00D33.93011871679923009364[ ]A
2024-02-16 10:24:25.269020-05:00D33.93058071679923009601[ ]A
2024-02-16 10:24:25.296243-05:00D33.93032571679923009612[ ]A
2024-02-16 10:24:26.092502-05:00N33.92527552983526543427[ ]A
\n", "

12032 rows × 6 columns

\n", "
" ], "text/plain": [ " exchange price size \\\n", "symbol timestamp \n", "BAC 2024-02-16 09:30:00.708657-05:00 P 33.920 200 \n", " 2024-02-16 09:30:00.708657-05:00 P 33.920 200 \n", " 2024-02-16 09:30:00.708660-05:00 P 33.920 200 \n", " 2024-02-16 09:30:00.708687-05:00 P 33.920 200 \n", " 2024-02-16 09:30:00.708744-05:00 P 33.920 100 \n", "... ... ... ... \n", " 2024-02-16 10:24:25.203728-05:00 D 33.930 434 \n", " 2024-02-16 10:24:25.204664-05:00 D 33.930 118 \n", " 2024-02-16 10:24:25.269020-05:00 D 33.930 580 \n", " 2024-02-16 10:24:25.296243-05:00 D 33.930 325 \n", " 2024-02-16 10:24:26.092502-05:00 N 33.925 275 \n", "\n", " id conditions tape \n", "symbol timestamp \n", "BAC 2024-02-16 09:30:00.708657-05:00 52983525230899 [ ] A \n", " 2024-02-16 09:30:00.708657-05:00 52983525230900 [ , Q] A \n", " 2024-02-16 09:30:00.708660-05:00 52983525230901 [ ] A \n", " 2024-02-16 09:30:00.708687-05:00 52983525230903 [ ] A \n", " 2024-02-16 09:30:00.708744-05:00 52983525230904 [ ] A \n", "... ... ... ... \n", " 2024-02-16 10:24:25.203728-05:00 71679923009363 [ ] A \n", " 2024-02-16 10:24:25.204664-05:00 71679923009364 [ ] A \n", " 2024-02-16 10:24:25.269020-05:00 71679923009601 [ ] A \n", " 2024-02-16 10:24:25.296243-05:00 71679923009612 [ ] A \n", " 2024-02-16 10:24:26.092502-05:00 52983526543427 [ ] A \n", "\n", "[12032 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#trades_df.info()\n", "focused = trades_df.loc[(\"BAC\", \"2024-02-16 09:30:00\"):(\"BAC\", \"2024-02-16 10:24:26\")]\n", "focused" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
exchangepricesizeidconditionstape
symboltimestamp
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [exchange, price, size, id, conditions, tape]\n", "Index: []" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trades_df.loc[\"2024-02-16 09:30:00\":\"2024-02-16 10:24:26\"]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
openhighlowclosevolumetradesupdatedvwapbuyvolumesellvolume
time
2024-02-16 09:30:00-05:0033.92033.9233.90033.9001700.012.02024-02-16 09:30:01.023464-05:0033.9158820.0400.0
2024-02-16 09:30:01-05:0033.87033.9633.86033.960931216.0110.02024-02-16 09:30:02.118683-05:0033.949280912802.04471.0
2024-02-16 09:30:02-05:0033.96033.9833.94533.9451160.05.02024-02-16 09:30:03.440994-05:0033.959655180.0300.0
2024-02-16 09:30:03-05:0033.93033.9533.92133.9255376.013.02024-02-16 09:30:04.515116-05:0033.9346001548.0991.0
2024-02-16 09:30:04-05:0033.90033.9433.87033.8904759.013.02024-02-16 09:30:05.163964-05:0033.874182570.02889.0
.................................
2024-02-16 10:24:22-05:0033.93033.9333.93033.9301626.08.02024-02-16 10:24:23.246513-05:0033.9300000.00.0
2024-02-16 10:24:23-05:0033.93033.9333.92533.9302200.012.02024-02-16 10:24:24.061319-05:0033.929545400.0200.0
2024-02-16 10:24:24-05:0033.93033.9333.93033.9305054.010.02024-02-16 10:24:25.017519-05:0033.9300000.00.0
2024-02-16 10:24:25-05:0033.93033.9333.93033.9303712.08.02024-02-16 10:24:26.092502-05:0033.9300000.00.0
2024-02-16 10:24:26-05:0033.92533.9333.91533.9207457.033.02024-02-16 10:24:27.112140-05:0033.920636699.01461.0
\n", "

2077 rows × 10 columns

\n", "
" ], "text/plain": [ " open high low close volume trades \\\n", "time \n", "2024-02-16 09:30:00-05:00 33.920 33.92 33.900 33.900 1700.0 12.0 \n", "2024-02-16 09:30:01-05:00 33.870 33.96 33.860 33.960 931216.0 110.0 \n", "2024-02-16 09:30:02-05:00 33.960 33.98 33.945 33.945 1160.0 5.0 \n", "2024-02-16 09:30:03-05:00 33.930 33.95 33.921 33.925 5376.0 13.0 \n", "2024-02-16 09:30:04-05:00 33.900 33.94 33.870 33.890 4759.0 13.0 \n", "... ... ... ... ... ... ... \n", "2024-02-16 10:24:22-05:00 33.930 33.93 33.930 33.930 1626.0 8.0 \n", "2024-02-16 10:24:23-05:00 33.930 33.93 33.925 33.930 2200.0 12.0 \n", "2024-02-16 10:24:24-05:00 33.930 33.93 33.930 33.930 5054.0 10.0 \n", "2024-02-16 10:24:25-05:00 33.930 33.93 33.930 33.930 3712.0 8.0 \n", "2024-02-16 10:24:26-05:00 33.925 33.93 33.915 33.920 7457.0 33.0 \n", "\n", " updated vwap \\\n", "time \n", "2024-02-16 09:30:00-05:00 2024-02-16 09:30:01.023464-05:00 33.915882 \n", "2024-02-16 09:30:01-05:00 2024-02-16 09:30:02.118683-05:00 33.949280 \n", "2024-02-16 09:30:02-05:00 2024-02-16 09:30:03.440994-05:00 33.959655 \n", "2024-02-16 09:30:03-05:00 2024-02-16 09:30:04.515116-05:00 33.934600 \n", "2024-02-16 09:30:04-05:00 2024-02-16 09:30:05.163964-05:00 33.874182 \n", "... ... ... \n", "2024-02-16 10:24:22-05:00 2024-02-16 10:24:23.246513-05:00 33.930000 \n", "2024-02-16 10:24:23-05:00 2024-02-16 10:24:24.061319-05:00 33.929545 \n", "2024-02-16 10:24:24-05:00 2024-02-16 10:24:25.017519-05:00 33.930000 \n", "2024-02-16 10:24:25-05:00 2024-02-16 10:24:26.092502-05:00 33.930000 \n", "2024-02-16 10:24:26-05:00 2024-02-16 10:24:27.112140-05:00 33.920636 \n", "\n", " buyvolume sellvolume \n", "time \n", "2024-02-16 09:30:00-05:00 0.0 400.0 \n", "2024-02-16 09:30:01-05:00 912802.0 4471.0 \n", "2024-02-16 09:30:02-05:00 180.0 300.0 \n", "2024-02-16 09:30:03-05:00 1548.0 991.0 \n", "2024-02-16 09:30:04-05:00 570.0 2889.0 \n", "... ... ... \n", "2024-02-16 10:24:22-05:00 0.0 0.0 \n", "2024-02-16 10:24:23-05:00 400.0 200.0 \n", "2024-02-16 10:24:24-05:00 0.0 0.0 \n", "2024-02-16 10:24:25-05:00 0.0 0.0 \n", "2024-02-16 10:24:26-05:00 699.0 1461.0 \n", "\n", "[2077 rows x 10 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "focohlc = ohlcv_df.loc[\"2024-02-16 09:30:00\":\"2024-02-16 10:24:26\"]\n", "focohlc\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'focohlc' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfocohlc\u001b[49m\u001b[38;5;241m.\u001b[39minfo()\n", "\u001b[0;31mNameError\u001b[0m: name 'focohlc' is not defined" ] } ], "source": [ "focohlc.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#trades_df.to_parquet(dir + \"trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet\", engine='pyarrow', compression='gzip')\n", "#trades_df = pd.read_parquet(dir + \"trades_df-BAC-2024-01-01T09:30:00-2024-05-14T16:00:00-CO4B7VPWUZF-100.parquet\",engine='pyarrow')\n", "\n", "#trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
exchangepricesizeidconditionstape
symboltimestamp
BAC2024-01-02 09:30:00.329729-05:00T33.390968462879149669684[ , Q]A
2024-01-02 09:30:00.758040-05:00P33.43070052983525495600[ ]A
2024-01-02 09:30:00.758997-05:00P33.44040052983525495601[ ]A
2024-01-02 09:30:01.086662-05:00N33.46042948352983525028681[ , Q]A
2024-01-02 09:30:01.086708-05:00P33.45010052983525495863[ ]A
.....................
2024-05-14 15:59:59.976990-04:00N38.4904250152983576482130[ ]A
2024-05-14 15:59:59.977038-04:00N38.49510052983576482132[ ]A
2024-05-14 15:59:59.977065-04:00N38.49514052983576482133[ ]A
2024-05-14 15:59:59.977618-04:00T38.49514062883460167226[ ]A
2024-05-14 15:59:59.978364-04:00D38.49510071714533561907[ ]A
\n", "

6467196 rows × 6 columns

\n", "
" ], "text/plain": [ " exchange price size \\\n", "symbol timestamp \n", "BAC 2024-01-02 09:30:00.329729-05:00 T 33.390 9684 \n", " 2024-01-02 09:30:00.758040-05:00 P 33.430 700 \n", " 2024-01-02 09:30:00.758997-05:00 P 33.440 400 \n", " 2024-01-02 09:30:01.086662-05:00 N 33.460 429483 \n", " 2024-01-02 09:30:01.086708-05:00 P 33.450 100 \n", "... ... ... ... \n", " 2024-05-14 15:59:59.976990-04:00 N 38.490 42501 \n", " 2024-05-14 15:59:59.977038-04:00 N 38.495 100 \n", " 2024-05-14 15:59:59.977065-04:00 N 38.495 140 \n", " 2024-05-14 15:59:59.977618-04:00 T 38.495 140 \n", " 2024-05-14 15:59:59.978364-04:00 D 38.495 100 \n", "\n", " id conditions tape \n", "symbol timestamp \n", "BAC 2024-01-02 09:30:00.329729-05:00 62879149669684 [ , Q] A \n", " 2024-01-02 09:30:00.758040-05:00 52983525495600 [ ] A \n", " 2024-01-02 09:30:00.758997-05:00 52983525495601 [ ] A \n", " 2024-01-02 09:30:01.086662-05:00 52983525028681 [ , Q] A \n", " 2024-01-02 09:30:01.086708-05:00 52983525495863 [ ] A \n", "... ... ... ... \n", " 2024-05-14 15:59:59.976990-04:00 52983576482130 [ ] A \n", " 2024-05-14 15:59:59.977038-04:00 52983576482132 [ ] A \n", " 2024-05-14 15:59:59.977065-04:00 52983576482133 [ ] A \n", " 2024-05-14 15:59:59.977618-04:00 62883460167226 [ ] A \n", " 2024-05-14 15:59:59.978364-04:00 71714533561907 [ ] A \n", "\n", "[6467196 rows x 6 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trades_df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "file_trades" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#list all files is dir directory with parquet extension\n", "dir = DATA_DIR + \"/notebooks/\"\n", "import os\n", "files = [f for f in os.listdir(dir) if f.endswith(\".parquet\")]\n", "file_name = \"\"\n", "ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ohlcv_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "# Calculate daily returns\n", "ohlcv_df['returns'] = ohlcv_df['close'].pct_change().dropna()\n", "#same as above but pct_change is from 3 datapoints back, but only if it is the same date, else na\n", "\n", "\n", "# Plot the probability distribution curve\n", "plt.figure(figsize=(10, 6))\n", "sns.histplot(df['returns'].dropna(), kde=True, stat='probability', bins=30)\n", "plt.title('Probability Distribution of Daily Returns')\n", "plt.xlabel('Daily Returns')\n", "plt.ylabel('Probability')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Define the intervals from 5 to 20 s, returns for each interval\n", "#maybe use rolling window?\n", "intervals = range(5, 21, 5)\n", "\n", "# Create columns for percentage returns\n", "rolling_window = 50\n", "\n", "# Normalize the returns using rolling mean and std\n", "for N in intervals:\n", " column_name = f'returns_{N}'\n", " rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean()\n", " rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std()\n", " ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std\n", "\n", "# Display the dataframe with normalized return columns\n", "ohlcv_df\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Calculate the sum of the normalized return columns for each row\n", "ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1)\n", "\n", "# Sort the DataFrame based on the sum of normalized returns in descending order\n", "df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False)\n", "\n", "# Display the top rows with the highest sum of normalized returns\n", "df_sorted\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop initial rows with NaN values due to pct_change\n", "ohlcv_df.dropna(inplace=True)\n", "\n", "# Plotting the probability distribution curves\n", "plt.figure(figsize=(14, 8))\n", "for N in intervals:\n", " sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True)\n", "\n", "plt.title('Probability Distribution of Percentage Returns')\n", "plt.xlabel('Percentage Return')\n", "plt.ylabel('Density')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "# Plot the probability distribution curve\n", "plt.figure(figsize=(10, 6))\n", "sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)\n", "plt.title('Probability Distribution of Daily Returns')\n", "plt.xlabel('Daily Returns')\n", "plt.ylabel('Probability')\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#show only rows from ohlcv_df where returns > 0.005\n", "ohlcv_df[ohlcv_df['returns'] > 0.0005]\n", "\n", "#ohlcv_df[ohlcv_df['returns'] < -0.005]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#ohlcv where index = date 2024-03-13 and between hour 12\n", "\n", "a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00']\n", "a" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ohlcv_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trades_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ohlcv_df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trades_df.to_parquet(\"trades_df-spy-0111-0111.parquett\", engine='pyarrow', compression='gzip')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trades_df.to_parquet(\"trades_df-spy-111-0516.parquett\", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ohlcv_df.to_parquet(\"ohlcv_df-spy-111-0516.parquett\", engine='pyarrow', compression='gzip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)\n", "vbt.settings['plotting']['auto_rangebreaks'] = True\n", "basic_data.ohlcv.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#access just BCA\n", "#df_filtered = df.loc[\"BAC\"]" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 2 }