strategy-lab/research/data/prepare_aggregated_data.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create aggregated data from trades\n",
    "\n",
    "This notebook shows how new aggregated data are created and stored in the cache, from which they can later be loaded. Data are created for a given symbol, interval, and aggregation type/resolution, for example OHLCV_1m or OHLCV_VOLUME_2000 (volume bars with resolution 2000).\n",
    "\n",
    "Possible aggregation types\n",
    "- time based OHLCV, time resolution\n",
    "- volume based OHLCV, volume resolution\n",
    "- dollar based OHLCV, dollar amount resolution\n",
    "- renko bars, brick size as resolution\n",
    "\n",
    "\n",
    "Steps include\n",
    "- fetch trades (remote/cached)\n",
    "- use new vectorized aggregation to aggregate bars of given type (time, volume, dollar) and resolution\n",
    "- store to agg cache\n",
    "\n",
    "Methods:\n",
    "- `fetch_trades_parallel` fetches trades for a given symbol and interval and can also filter by trade conditions and minimum size. Returns `trades_df`.\n",
    "- `aggregate_trades` accepts `trades_df`, a resolution, and a bar type (VOLUME, TIME, DOLLAR) and returns the aggregated OHLCV dataframe `ohlcv_df`.\n",
    "\n",
    "TBD: this functionality will soon be moved into a separate package responsible for fetching the data (cache management, remote fetching and vectorized aggregation) - see the [issue](https://github.com/drew2323/v2trading/issues/250)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n",
      "Loaded env variables from file None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Activating profile profile1\n",
       "</pre>\n"
      ],
      "text/plain": [
       "Activating profile profile1\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trades_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet\n",
      "trades_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet\n",
      "trades_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet\n",
      "trades_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n",
      "ohlcv_df-BAC-2024-01-11T09:30:00-2024-01-12T16:00:00.parquet\n",
      "trades_df-BAC-2023-01-01T09:30:00-2024-10-02T16:00:00-['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']-100.parquet\n",
      "trades_df-BAC-2024-05-15T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n",
      "ohlcv_df-BAC-2023-01-01T09:30:00-2024-10-02T16:00:00-['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']-100.parquet\n",
      "ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n",
      "ohlcv_df-SPY-2024-01-01T09:30:00-2024-05-14T16:00:00.parquet\n",
      "ohlcv_df-BAC-2024-01-01T09_30_00-2024-05-14T16_00_00-CO4B7VPWUZF-100.parquet\n",
      "ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T16_00_00-47BCFOPUVWZ-100.parquet\n",
      "ohlcv_df-BAC-2023-01-01T09_30_00-2024-05-25T15_30_00-47BCFOPUVWZ-100.parquet\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from numba import jit\n",
    "from alpaca.data.historical import StockHistoricalDataClient\n",
    "from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR\n",
    "from alpaca.data.requests import StockTradesRequest\n",
    "from v2realbot.enums.enums import BarType\n",
    "import time\n",
    "from datetime import datetime\n",
    "from v2realbot.utils.utils import parse_alpaca_timestamp, ltp, zoneNY, send_to_telegram, fetch_calendar_data\n",
    "import pyarrow\n",
    "from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades\n",
    "import vectorbtpro as vbt\n",
    "import v2realbot.utils.config_handler as cfh\n",
    "\n",
    "vbt.settings.set_theme(\"dark\")\n",
    "vbt.settings['plotting']['layout']['width'] = 1280\n",
    "vbt.settings.plotting.auto_rangebreaks = True\n",
    "# Set the option to display with pagination\n",
    "pd.set_option('display.notebook_repr_html', True)\n",
    "pd.set_option('display.max_rows', 20)  # Number of rows per page\n",
    "# pd.set_option('display.float_format', '{:.9f}'.format)\n",
    "\n",
    "\n",
    "#trade filtering\n",
    "exclude_conditions = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') #standard ['C','O','4','B','7','V','P','W','U','Z','F']\n",
    "minsize = 100\n",
    "\n",
    "symbol = \"BAC\"\n",
    "#datetime in zoneNY \n",
    "day_start = datetime(2024, 10, 3, 9, 30, 0)\n",
    "day_stop = datetime(2024, 10, 16, 16, 00, 0)\n",
    "day_start = zoneNY.localize(day_start)\n",
    "day_stop = zoneNY.localize(day_stop)\n",
    "#filename of trades_df parquet, date are in isoformat but without time zone part\n",
    "dir = DATA_DIR + \"/notebooks/\"\n",
    "#parquet interval cache contains exclude conditions and minsize filtering\n",
    "file_trades = dir + f\"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}-{exclude_conditions}-{minsize}.parquet\"\n",
    "#file_trades = dir + f\"trades_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}.parquet\"\n",
    "file_ohlcv = dir + f\"ohlcv_df-{symbol}-{day_start.strftime('%Y-%m-%dT%H:%M:%S')}-{day_stop.strftime('%Y-%m-%dT%H:%M:%S')}-{str(exclude_conditions)}-{minsize}.parquet\"\n",
    "\n",
    "#PRINT all parquet in directory\n",
    "import os\n",
    "files = [f for f in os.listdir(dir) if f.endswith(\".parquet\")]\n",
    "for f in files:\n",
    "    print(f)\n",
    "\n",
    "exclude_conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Contains 10  market days\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing market days: 100%|██████████| 10/10 [00:00<00:00, 267.74it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NOT FOUND. Fetching from remote\n",
      "NOT FOUND. Fetching from remote\n",
      "NOT FOUND. Fetching from remote\n",
      "NOT FOUND. Fetching from remote\n",
      "NOT FOUND. Fetching from remote\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data:   0%|          | 0/10 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Remote fetched: is_empty=False 2024-10-03 09:30:00-04:00 2024-10-03 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1727962200-1727985600.cache.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data:  10%|█         | 1/10 [00:21<03:12, 21.41s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "NOT FOUND. Fetching from remote\n",
      "Remote fetched: is_empty=False 2024-10-08 09:30:00-04:00 2024-10-08 16:00:00-04:00\n",
      "Remote fetched: is_empty=False 2024-10-09 09:30:00-04:00 2024-10-09 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728394200-1728417600.cache.gz\n",
      "Remote fetched: is_empty=False 2024-10-07 09:30:00-04:00 2024-10-07 16:00:00-04:00\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "NOT FOUND. Fetching from remote\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728480600-1728504000.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "NOT FOUND. Fetching from remote\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728307800-1728331200.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "NOT FOUND. Fetching from remote\n",
      "Remote fetched: is_empty=False 2024-10-04 09:30:00-04:00 2024-10-04 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728048600-1728072000.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data:  20%|██        | 2/10 [00:32<02:01, 15.24s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "minsize 100\n",
      "NOT FOUND. Fetching from remote\n",
      "Remote fetched: is_empty=False 2024-10-10 09:30:00-04:00 2024-10-10 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728567000-1728590400.cache.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data:  60%|██████    | 6/10 [00:47<00:25,  6.40s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "Remote fetched: is_empty=False 2024-10-14 09:30:00-04:00 2024-10-14 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728912600-1728936000.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "Remote fetched: is_empty=False 2024-10-16 09:30:00-04:00 2024-10-16 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1729085400-1729108800.cache.gz\n",
      "Remote fetched: is_empty=False 2024-10-11 09:30:00-04:00 2024-10-11 16:00:00-04:00\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n",
      "minsize 100\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728653400-1728676800.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data:  70%|███████   | 7/10 [01:13<00:31, 10.55s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "minsize 100\n",
      "Remote fetched: is_empty=False 2024-10-15 09:30:00-04:00 2024-10-15 16:00:00-04:00\n",
      "Saving to Trade CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/BAC-1728999000-1729022400.cache.gz\n",
      "excluding conditions ['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching data: 100%|██████████| 10/10 [01:25<00:00,  8.53s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "minsize 100\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from v2realbot.loader.aggregator_vectorized import fetch_daily_stock_trades, fetch_trades_parallel, generate_time_bars_nb, aggregate_trades, fetch_trades_parallel_optimized\n",
    "#fetch trades in one go\n",
    "#trades_df = fetch_daily_stock_trades(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, force_remote=False, max_retries=5, backoff_factor=1)\n",
    "#fetch trades in parallel - for longer intervals\n",
    "trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=minsize, force_remote=True, max_workers=None)\n",
    " \n",
    "##trades_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Either load trades or ohlcv from parquet if exists\n",
    "\n",
    "#trades_df = fetch_trades_parallel(symbol, day_start, day_stop, exclude_conditions=exclude_conditions, minsize=50, max_workers=20) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])\n",
    "# trades_df.to_parquet(file_trades, engine='pyarrow', compression='gzip')\n",
    "\n",
    "trades_df = pd.read_parquet(file_trades,engine='pyarrow')\n",
    "ohlcv_df = aggregate_trades(symbol=symbol, trades_df=trades_df, resolution=1, type=BarType.TIME)\n",
    "ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow', compression='gzip')\n",
    "\n",
    "# ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')\n",
    "# trades_df = pd.read_parquet(file_trades,engine='pyarrow')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#list all files in the dir directory with parquet extension\n",
    "dir = DATA_DIR + \"/notebooks/\"\n",
    "import os\n",
    "files = [f for f in os.listdir(dir) if f.endswith(\".parquet\")]\n",
    "file_name = \"\"\n",
    "ohlcv_df = pd.read_parquet(file_ohlcv,engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"/Users/davidbrazda/Library/Application Support/v2realbot/notebooks/ohlcv_df-BAC-2024-10-03T09:30:00-2024-10-16T16:00:00-['4', '7', 'B', 'C', 'F', 'O', 'P', 'U', 'V', 'W', 'Z']-100.parquet\""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_ohlcv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>open</th>\n",
       "      <th>high</th>\n",
       "      <th>low</th>\n",
       "      <th>close</th>\n",
       "      <th>volume</th>\n",
       "      <th>trades</th>\n",
       "      <th>updated</th>\n",
       "      <th>vwap</th>\n",
       "      <th>buyvolume</th>\n",
       "      <th>sellvolume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2024-10-03 09:30:00-04:00</th>\n",
       "      <td>38.9800</td>\n",
       "      <td>39.0000</td>\n",
       "      <td>38.940</td>\n",
       "      <td>38.970</td>\n",
       "      <td>249774.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2024-10-03 09:30:01.061997-04:00</td>\n",
       "      <td>38.960055</td>\n",
       "      <td>500.0</td>\n",
       "      <td>249088.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-03 09:30:01-04:00</th>\n",
       "      <td>38.9500</td>\n",
       "      <td>39.0001</td>\n",
       "      <td>38.950</td>\n",
       "      <td>39.000</td>\n",
       "      <td>13553.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>2024-10-03 09:30:02.171691-04:00</td>\n",
       "      <td>38.985179</td>\n",
       "      <td>2133.0</td>\n",
       "      <td>1894.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-03 09:30:02-04:00</th>\n",
       "      <td>38.9992</td>\n",
       "      <td>39.0100</td>\n",
       "      <td>38.990</td>\n",
       "      <td>39.010</td>\n",
       "      <td>4600.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2024-10-03 09:30:03.091339-04:00</td>\n",
       "      <td>39.000123</td>\n",
       "      <td>1031.0</td>\n",
       "      <td>797.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-03 09:30:03-04:00</th>\n",
       "      <td>38.9900</td>\n",
       "      <td>39.0400</td>\n",
       "      <td>38.990</td>\n",
       "      <td>39.030</td>\n",
       "      <td>7533.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>2024-10-03 09:30:04.193646-04:00</td>\n",
       "      <td>39.030827</td>\n",
       "      <td>1733.0</td>\n",
       "      <td>713.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-03 09:30:04-04:00</th>\n",
       "      <td>39.0320</td>\n",
       "      <td>39.0350</td>\n",
       "      <td>39.032</td>\n",
       "      <td>39.035</td>\n",
       "      <td>9142.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2024-10-03 09:30:07.260896-04:00</td>\n",
       "      <td>39.032033</td>\n",
       "      <td>9142.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-16 15:59:55-04:00</th>\n",
       "      <td>42.8100</td>\n",
       "      <td>42.8100</td>\n",
       "      <td>42.810</td>\n",
       "      <td>42.810</td>\n",
       "      <td>8681.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>2024-10-16 15:59:56.000104-04:00</td>\n",
       "      <td>42.810000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-16 15:59:56-04:00</th>\n",
       "      <td>42.8150</td>\n",
       "      <td>42.8150</td>\n",
       "      <td>42.810</td>\n",
       "      <td>42.810</td>\n",
       "      <td>4128.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2024-10-16 15:59:57.010896-04:00</td>\n",
       "      <td>42.811550</td>\n",
       "      <td>1100.0</td>\n",
       "      <td>603.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-16 15:59:57-04:00</th>\n",
       "      <td>42.8150</td>\n",
       "      <td>42.8150</td>\n",
       "      <td>42.810</td>\n",
       "      <td>42.810</td>\n",
       "      <td>5301.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2024-10-16 15:59:58.006387-04:00</td>\n",
       "      <td>42.812493</td>\n",
       "      <td>789.0</td>\n",
       "      <td>1708.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-16 15:59:58-04:00</th>\n",
       "      <td>42.8160</td>\n",
       "      <td>42.8200</td>\n",
       "      <td>42.800</td>\n",
       "      <td>42.800</td>\n",
       "      <td>21469.0</td>\n",
       "      <td>33.0</td>\n",
       "      <td>2024-10-16 15:59:59.088188-04:00</td>\n",
       "      <td>42.809572</td>\n",
       "      <td>542.0</td>\n",
       "      <td>632.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2024-10-16 15:59:59-04:00</th>\n",
       "      <td>42.8087</td>\n",
       "      <td>42.8100</td>\n",
       "      <td>42.800</td>\n",
       "      <td>42.810</td>\n",
       "      <td>26899.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>2024-10-16 15:59:59.997799-04:00</td>\n",
       "      <td>42.801563</td>\n",
       "      <td>4757.0</td>\n",
       "      <td>16482.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>114097 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                              open     high     low   close    volume  trades  \\\n",
       "time                                                                            \n",
       "2024-10-03 09:30:00-04:00  38.9800  39.0000  38.940  38.970  249774.0     6.0   \n",
       "2024-10-03 09:30:01-04:00  38.9500  39.0001  38.950  39.000   13553.0    44.0   \n",
       "2024-10-03 09:30:02-04:00  38.9992  39.0100  38.990  39.010    4600.0    20.0   \n",
       "2024-10-03 09:30:03-04:00  38.9900  39.0400  38.990  39.030    7533.0    36.0   \n",
       "2024-10-03 09:30:04-04:00  39.0320  39.0350  39.032  39.035    9142.0     2.0   \n",
       "...                            ...      ...     ...     ...       ...     ...   \n",
       "2024-10-16 15:59:55-04:00  42.8100  42.8100  42.810  42.810    8681.0    22.0   \n",
       "2024-10-16 15:59:56-04:00  42.8150  42.8150  42.810  42.810    4128.0     9.0   \n",
       "2024-10-16 15:59:57-04:00  42.8150  42.8150  42.810  42.810    5301.0    20.0   \n",
       "2024-10-16 15:59:58-04:00  42.8160  42.8200  42.800  42.800   21469.0    33.0   \n",
       "2024-10-16 15:59:59-04:00  42.8087  42.8100  42.800  42.810   26899.0    16.0   \n",
       "\n",
       "                                                   updated       vwap  \\\n",
       "time                                                                    \n",
       "2024-10-03 09:30:00-04:00 2024-10-03 09:30:01.061997-04:00  38.960055   \n",
       "2024-10-03 09:30:01-04:00 2024-10-03 09:30:02.171691-04:00  38.985179   \n",
       "2024-10-03 09:30:02-04:00 2024-10-03 09:30:03.091339-04:00  39.000123   \n",
       "2024-10-03 09:30:03-04:00 2024-10-03 09:30:04.193646-04:00  39.030827   \n",
       "2024-10-03 09:30:04-04:00 2024-10-03 09:30:07.260896-04:00  39.032033   \n",
       "...                                                    ...        ...   \n",
       "2024-10-16 15:59:55-04:00 2024-10-16 15:59:56.000104-04:00  42.810000   \n",
       "2024-10-16 15:59:56-04:00 2024-10-16 15:59:57.010896-04:00  42.811550   \n",
       "2024-10-16 15:59:57-04:00 2024-10-16 15:59:58.006387-04:00  42.812493   \n",
       "2024-10-16 15:59:58-04:00 2024-10-16 15:59:59.088188-04:00  42.809572   \n",
       "2024-10-16 15:59:59-04:00 2024-10-16 15:59:59.997799-04:00  42.801563   \n",
       "\n",
       "                           buyvolume  sellvolume  \n",
       "time                                              \n",
       "2024-10-03 09:30:00-04:00      500.0    249088.0  \n",
       "2024-10-03 09:30:01-04:00     2133.0      1894.0  \n",
       "2024-10-03 09:30:02-04:00     1031.0       797.0  \n",
       "2024-10-03 09:30:03-04:00     1733.0       713.0  \n",
       "2024-10-03 09:30:04-04:00     9142.0         0.0  \n",
       "...                              ...         ...  \n",
       "2024-10-16 15:59:55-04:00        0.0         0.0  \n",
       "2024-10-16 15:59:56-04:00     1100.0       603.0  \n",
       "2024-10-16 15:59:57-04:00      789.0      1708.0  \n",
       "2024-10-16 15:59:58-04:00      542.0       632.0  \n",
       "2024-10-16 15:59:59-04:00     4757.0     16482.0  \n",
       "\n",
       "[114097 rows x 10 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ohlcv_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Bar-to-bar percentage change of the close price. The first row has no\n",
    "# predecessor and stays NaN; NaNs are dropped at plot time below.\n",
    "ohlcv_df['returns'] = ohlcv_df['close'].pct_change()\n",
    "# TODO: same as above but pct_change from 3 datapoints back, only within the same date, else NaN\n",
    "\n",
    "# Plot the probability distribution curve of the returns.\n",
    "# The returns live in `ohlcv_df`; no `df` variable is defined in this notebook.\n",
    "# NOTE(review): the labels say \"Daily\" although the returns are per bar - confirm the intended wording.\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)\n",
    "plt.title('Probability Distribution of Daily Returns')\n",
    "plt.xlabel('Daily Returns')\n",
    "plt.ylabel('Probability')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# Look-back lengths for the return columns: 5, 10, 15 and 20 bars.\n",
    "# NOTE(review): a look-back of N bars equals N seconds only if the bars are\n",
    "# contiguous 1-second bars - confirm, or switch to a time-based shift.\n",
    "intervals = range(5, 21, 5)\n",
    "\n",
    "# Window (in bars) of the rolling mean/std used for normalization\n",
    "rolling_window = 50\n",
    "\n",
    "# Create columns for percentage returns. The normalization loop below reads\n",
    "# `returns_{N}`, so these columns must exist before it runs (otherwise KeyError).\n",
    "# Returns are computed per calendar day so that no value spans an overnight gap;\n",
    "# the first N bars of each day are NaN. Assumes a DatetimeIndex - TODO confirm.\n",
    "close_by_day = ohlcv_df.groupby(ohlcv_df.index.date)['close']\n",
    "for N in intervals:\n",
    "    ohlcv_df[f'returns_{N}'] = close_by_day.pct_change(periods=N)\n",
    "\n",
    "# Normalize the returns using rolling mean and std (z-score over the rolling window)\n",
    "for N in intervals:\n",
    "    column_name = f'returns_{N}'\n",
    "    rolling_mean = ohlcv_df[column_name].rolling(window=rolling_window).mean()\n",
    "    rolling_std = ohlcv_df[column_name].rolling(window=rolling_window).std()\n",
    "    ohlcv_df[f'norm_{column_name}'] = (ohlcv_df[column_name] - rolling_mean) / rolling_std\n",
    "\n",
    "# Display the dataframe with the return and normalized return columns\n",
    "ohlcv_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the sum of the normalized return columns for each row\n",
    "ohlcv_df['sum_norm_returns'] = ohlcv_df[[f'norm_returns_{N}' for N in intervals]].sum(axis=1)\n",
    "\n",
    "# Sort the DataFrame based on the sum of normalized returns in descending order\n",
    "df_sorted = ohlcv_df.sort_values(by='sum_norm_returns', ascending=False)\n",
    "\n",
    "# Display the top rows with the highest sum of normalized returns\n",
    "df_sorted\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop initial rows with NaN values due to pct_change\n",
    "ohlcv_df.dropna(inplace=True)\n",
    "\n",
    "# Plotting the probability distribution curves\n",
    "plt.figure(figsize=(14, 8))\n",
    "for N in intervals:\n",
    "    sns.kdeplot(ohlcv_df[f'returns_{N}'].dropna(), label=f'Returns {N}', fill=True)\n",
    "\n",
    "plt.title('Probability Distribution of Percentage Returns')\n",
    "plt.xlabel('Percentage Return')\n",
    "plt.ylabel('Density')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# Plot the probability distribution curve\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.histplot(ohlcv_df['returns'].dropna(), kde=True, stat='probability', bins=30)\n",
    "plt.title('Probability Distribution of Daily Returns')\n",
    "plt.xlabel('Daily Returns')\n",
    "plt.ylabel('Probability')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#show only rows from ohlcv_df where returns > 0.0005\n",
    "ohlcv_df[ohlcv_df['returns'] > 0.0005]\n",
    "\n",
    "#ohlcv_df[ohlcv_df['returns'] < -0.005]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ohlcv where index = date 2024-03-13 and between hour 12\n",
    "\n",
    "a = ohlcv_df.loc['2024-03-13 12:00:00':'2024-03-13 13:00:00']\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ohlcv_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trades_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ohlcv_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ad-hoc export of the current trades_df to the working directory.\n",
    "# Uses the '.parquet' extension so the file matches the notebook's `.endswith(\".parquet\")` listings.\n",
    "# NOTE(review): the file name says 'spy' while the configured symbol is BAC - confirm before running.\n",
    "trades_df.to_parquet(\"trades_df-spy-0111-0111.parquet\", engine='pyarrow', compression='gzip')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ad-hoc export of the current trades_df to the working directory; timestamp truncation is allowed on write.\n",
    "# Uses the '.parquet' extension so the file matches the notebook's `.endswith(\".parquet\")` listings.\n",
    "# NOTE(review): the file name says 'spy' while the configured symbol is BAC - confirm before running.\n",
    "trades_df.to_parquet(\"trades_df-spy-111-0516.parquet\", engine='pyarrow', compression='gzip', allow_truncated_timestamps=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ad-hoc export of the current ohlcv_df to the working directory.\n",
    "# Uses the '.parquet' extension so the file matches the notebook's `.endswith(\".parquet\")` listings.\n",
    "# NOTE(review): the file name says 'spy' while the configured symbol is BAC - confirm before running.\n",
    "ohlcv_df.to_parquet(\"ohlcv_df-spy-111-0516.parquet\", engine='pyarrow', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "basic_data = vbt.Data.from_data(vbt.symbol_dict({symbol: ohlcv_df}), tz_convert=zoneNY)\n",
    "vbt.settings['plotting']['auto_rangebreaks'] = True\n",
    "basic_data.ohlcv.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#access just BAC\n",
    "#df_filtered = df.loc[\"BAC\"]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}