optimalizations
This commit is contained in:
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name='ttools',
|
||||
version='0.6.4',
|
||||
version='0.7.0',
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
# list your dependencies here
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
"from ttools.utils import AggType\n",
|
||||
"from datetime import datetime\n",
|
||||
"from ttools.aggregator_vectorized import generate_time_bars_nb, aggregate_trades\n",
|
||||
"from ttools.loaders import load_data, prepare_trade_cache\n",
|
||||
"from ttools.loaders import load_data, prepare_trade_cache, fetch_daily_stock_trades\n",
|
||||
"from ttools.utils import zoneNY\n",
|
||||
"import vectorbtpro as vbt\n",
|
||||
"from lightweight_charts import PlotDFAccessor, PlotSRAccessor\n",
|
||||
@ -69,7 +69,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -110,44 +110,44 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-02-15 09:30:00-05:00</th>\n",
|
||||
" <td>499.29</td>\n",
|
||||
" <td>499.41</td>\n",
|
||||
" <td>499.2900</td>\n",
|
||||
" <td>499.3200</td>\n",
|
||||
" <td>161900.0</td>\n",
|
||||
" <th>2024-09-16 04:01:24-04:00</th>\n",
|
||||
" <td>562.22</td>\n",
|
||||
" <td>562.22</td>\n",
|
||||
" <td>562.22</td>\n",
|
||||
" <td>562.22</td>\n",
|
||||
" <td>200.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-02-15 09:30:01-05:00</th>\n",
|
||||
" <td>499.32</td>\n",
|
||||
" <td>499.41</td>\n",
|
||||
" <td>499.3000</td>\n",
|
||||
" <td>499.4000</td>\n",
|
||||
" <td>10900.0</td>\n",
|
||||
" <th>2024-09-16 04:02:24-04:00</th>\n",
|
||||
" <td>562.17</td>\n",
|
||||
" <td>562.17</td>\n",
|
||||
" <td>562.17</td>\n",
|
||||
" <td>562.17</td>\n",
|
||||
" <td>293.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-02-15 09:30:02-05:00</th>\n",
|
||||
" <td>499.36</td>\n",
|
||||
" <td>499.40</td>\n",
|
||||
" <td>499.3550</td>\n",
|
||||
" <td>499.3800</td>\n",
|
||||
" <td>7040.0</td>\n",
|
||||
" <th>2024-09-16 04:04:36-04:00</th>\n",
|
||||
" <td>562.54</td>\n",
|
||||
" <td>562.54</td>\n",
|
||||
" <td>562.54</td>\n",
|
||||
" <td>562.54</td>\n",
|
||||
" <td>100.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-02-15 09:30:03-05:00</th>\n",
|
||||
" <td>499.39</td>\n",
|
||||
" <td>499.42</td>\n",
|
||||
" <td>499.3800</td>\n",
|
||||
" <td>499.4000</td>\n",
|
||||
" <td>8717.0</td>\n",
|
||||
" <th>2024-09-16 04:10:00-04:00</th>\n",
|
||||
" <td>562.39</td>\n",
|
||||
" <td>562.39</td>\n",
|
||||
" <td>562.39</td>\n",
|
||||
" <td>562.39</td>\n",
|
||||
" <td>102.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-02-15 09:30:04-05:00</th>\n",
|
||||
" <td>499.40</td>\n",
|
||||
" <td>499.40</td>\n",
|
||||
" <td>499.3500</td>\n",
|
||||
" <td>499.3500</td>\n",
|
||||
" <td>3265.0</td>\n",
|
||||
" <th>2024-09-16 04:10:24-04:00</th>\n",
|
||||
" <td>562.44</td>\n",
|
||||
" <td>562.44</td>\n",
|
||||
" <td>562.44</td>\n",
|
||||
" <td>562.44</td>\n",
|
||||
" <td>371.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
@ -158,69 +158,69 @@
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-03-18 15:59:55-04:00</th>\n",
|
||||
" <td>512.94</td>\n",
|
||||
" <td>512.94</td>\n",
|
||||
" <td>512.8600</td>\n",
|
||||
" <td>512.8900</td>\n",
|
||||
" <td>7345.0</td>\n",
|
||||
" <th>2024-10-18 19:57:24-04:00</th>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>100.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-03-18 15:59:56-04:00</th>\n",
|
||||
" <td>512.90</td>\n",
|
||||
" <td>512.90</td>\n",
|
||||
" <td>512.8700</td>\n",
|
||||
" <td>512.8800</td>\n",
|
||||
" <td>2551.0</td>\n",
|
||||
" <th>2024-10-18 19:57:48-04:00</th>\n",
|
||||
" <td>584.84</td>\n",
|
||||
" <td>584.84</td>\n",
|
||||
" <td>584.84</td>\n",
|
||||
" <td>584.84</td>\n",
|
||||
" <td>622.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-03-18 15:59:57-04:00</th>\n",
|
||||
" <td>512.89</td>\n",
|
||||
" <td>512.91</td>\n",
|
||||
" <td>512.8500</td>\n",
|
||||
" <td>512.8701</td>\n",
|
||||
" <td>18063.0</td>\n",
|
||||
" <th>2024-10-18 19:58:48-04:00</th>\n",
|
||||
" <td>584.77</td>\n",
|
||||
" <td>584.79</td>\n",
|
||||
" <td>584.77</td>\n",
|
||||
" <td>584.79</td>\n",
|
||||
" <td>4158.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-03-18 15:59:58-04:00</th>\n",
|
||||
" <td>512.87</td>\n",
|
||||
" <td>512.90</td>\n",
|
||||
" <td>512.8496</td>\n",
|
||||
" <td>512.9000</td>\n",
|
||||
" <td>7734.0</td>\n",
|
||||
" <th>2024-10-18 19:59:36-04:00</th>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>584.82</td>\n",
|
||||
" <td>584.80</td>\n",
|
||||
" <td>584.82</td>\n",
|
||||
" <td>298.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-03-18 15:59:59-04:00</th>\n",
|
||||
" <td>512.92</td>\n",
|
||||
" <td>512.92</td>\n",
|
||||
" <td>512.8200</td>\n",
|
||||
" <td>512.8700</td>\n",
|
||||
" <td>37159.0</td>\n",
|
||||
" <th>2024-10-18 19:59:48-04:00</th>\n",
|
||||
" <td>584.76</td>\n",
|
||||
" <td>584.76</td>\n",
|
||||
" <td>584.72</td>\n",
|
||||
" <td>584.72</td>\n",
|
||||
" <td>258.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>417345 rows × 5 columns</p>\n",
|
||||
"<p>64218 rows × 5 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" open high low close volume\n",
|
||||
"time \n",
|
||||
"2024-02-15 09:30:00-05:00 499.29 499.41 499.2900 499.3200 161900.0\n",
|
||||
"2024-02-15 09:30:01-05:00 499.32 499.41 499.3000 499.4000 10900.0\n",
|
||||
"2024-02-15 09:30:02-05:00 499.36 499.40 499.3550 499.3800 7040.0\n",
|
||||
"2024-02-15 09:30:03-05:00 499.39 499.42 499.3800 499.4000 8717.0\n",
|
||||
"2024-02-15 09:30:04-05:00 499.40 499.40 499.3500 499.3500 3265.0\n",
|
||||
"... ... ... ... ... ...\n",
|
||||
"2024-03-18 15:59:55-04:00 512.94 512.94 512.8600 512.8900 7345.0\n",
|
||||
"2024-03-18 15:59:56-04:00 512.90 512.90 512.8700 512.8800 2551.0\n",
|
||||
"2024-03-18 15:59:57-04:00 512.89 512.91 512.8500 512.8701 18063.0\n",
|
||||
"2024-03-18 15:59:58-04:00 512.87 512.90 512.8496 512.9000 7734.0\n",
|
||||
"2024-03-18 15:59:59-04:00 512.92 512.92 512.8200 512.8700 37159.0\n",
|
||||
" open high low close volume\n",
|
||||
"time \n",
|
||||
"2024-09-16 04:01:24-04:00 562.22 562.22 562.22 562.22 200.0\n",
|
||||
"2024-09-16 04:02:24-04:00 562.17 562.17 562.17 562.17 293.0\n",
|
||||
"2024-09-16 04:04:36-04:00 562.54 562.54 562.54 562.54 100.0\n",
|
||||
"2024-09-16 04:10:00-04:00 562.39 562.39 562.39 562.39 102.0\n",
|
||||
"2024-09-16 04:10:24-04:00 562.44 562.44 562.44 562.44 371.0\n",
|
||||
"... ... ... ... ... ...\n",
|
||||
"2024-10-18 19:57:24-04:00 584.80 584.80 584.80 584.80 100.0\n",
|
||||
"2024-10-18 19:57:48-04:00 584.84 584.84 584.84 584.84 622.0\n",
|
||||
"2024-10-18 19:58:48-04:00 584.77 584.79 584.77 584.79 4158.0\n",
|
||||
"2024-10-18 19:59:36-04:00 584.80 584.82 584.80 584.82 298.0\n",
|
||||
"2024-10-18 19:59:48-04:00 584.76 584.76 584.72 584.72 258.0\n",
|
||||
"\n",
|
||||
"[417345 rows x 5 columns]"
|
||||
"[64218 rows x 5 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -229,17 +229,17 @@
|
||||
"#This is how to call LOAD function\n",
|
||||
"symbol = [\"SPY\"]\n",
|
||||
"#datetime in zoneNY \n",
|
||||
"day_start = datetime(2024, 2, 15, 9, 30, 0)\n",
|
||||
"day_stop = datetime(2024, 3, 18, 16, 0, 0)\n",
|
||||
"day_start = datetime(2024, 9, 15, 9, 30, 0)\n",
|
||||
"day_stop = datetime(2024, 10, 20, 16, 0, 0)\n",
|
||||
"day_start = zoneNY.localize(day_start)\n",
|
||||
"day_stop = zoneNY.localize(day_stop)\n",
|
||||
"\n",
|
||||
"#requested AGG\n",
|
||||
"resolution = 1 #12s bars\n",
|
||||
"resolution = 12 #12s bars\n",
|
||||
"agg_type = AggType.OHLCV #other types AggType.OHLCV_VOL, AggType.OHLCV_DOL, AggType.OHLCV_RENKO\n",
|
||||
"exclude_conditions = ['C','O','4','B','7','V','P','W','U','Z','F','9','M','6'] #None to defaults\n",
|
||||
"minsize = 100 #min trade size to include\n",
|
||||
"main_session_only = True\n",
|
||||
"main_session_only = False\n",
|
||||
"force_remote = False\n",
|
||||
"\n",
|
||||
"data = load_data(symbol = symbol,\n",
|
||||
@ -260,162 +260,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>open</th>\n",
|
||||
" <th>high</th>\n",
|
||||
" <th>low</th>\n",
|
||||
" <th>close</th>\n",
|
||||
" <th>volume</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>time</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-14 09:45:00-04:00</th>\n",
|
||||
" <td>41.9650</td>\n",
|
||||
" <td>41.970</td>\n",
|
||||
" <td>41.950</td>\n",
|
||||
" <td>41.9500</td>\n",
|
||||
" <td>17895.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-14 09:45:12-04:00</th>\n",
|
||||
" <td>41.9589</td>\n",
|
||||
" <td>41.965</td>\n",
|
||||
" <td>41.950</td>\n",
|
||||
" <td>41.9650</td>\n",
|
||||
" <td>6281.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-14 09:45:24-04:00</th>\n",
|
||||
" <td>41.9650</td>\n",
|
||||
" <td>42.005</td>\n",
|
||||
" <td>41.965</td>\n",
|
||||
" <td>41.9975</td>\n",
|
||||
" <td>3522.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-14 09:45:36-04:00</th>\n",
|
||||
" <td>41.9900</td>\n",
|
||||
" <td>42.005</td>\n",
|
||||
" <td>41.990</td>\n",
|
||||
" <td>42.0000</td>\n",
|
||||
" <td>5960.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-14 09:45:48-04:00</th>\n",
|
||||
" <td>42.0050</td>\n",
|
||||
" <td>42.040</td>\n",
|
||||
" <td>42.005</td>\n",
|
||||
" <td>42.0300</td>\n",
|
||||
" <td>9113.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-16 15:00:00-04:00</th>\n",
|
||||
" <td>42.9150</td>\n",
|
||||
" <td>42.915</td>\n",
|
||||
" <td>42.910</td>\n",
|
||||
" <td>42.9100</td>\n",
|
||||
" <td>12872.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-16 15:00:12-04:00</th>\n",
|
||||
" <td>42.9150</td>\n",
|
||||
" <td>42.920</td>\n",
|
||||
" <td>42.910</td>\n",
|
||||
" <td>42.9200</td>\n",
|
||||
" <td>7574.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-16 15:00:24-04:00</th>\n",
|
||||
" <td>42.9200</td>\n",
|
||||
" <td>42.920</td>\n",
|
||||
" <td>42.910</td>\n",
|
||||
" <td>42.9200</td>\n",
|
||||
" <td>1769.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-16 15:00:36-04:00</th>\n",
|
||||
" <td>42.9200</td>\n",
|
||||
" <td>42.920</td>\n",
|
||||
" <td>42.905</td>\n",
|
||||
" <td>42.9050</td>\n",
|
||||
" <td>26599.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-16 15:00:48-04:00</th>\n",
|
||||
" <td>42.9050</td>\n",
|
||||
" <td>42.905</td>\n",
|
||||
" <td>42.880</td>\n",
|
||||
" <td>42.8800</td>\n",
|
||||
" <td>9216.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5480 rows × 5 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" open high low close volume\n",
|
||||
"time \n",
|
||||
"2024-10-14 09:45:00-04:00 41.9650 41.970 41.950 41.9500 17895.0\n",
|
||||
"2024-10-14 09:45:12-04:00 41.9589 41.965 41.950 41.9650 6281.0\n",
|
||||
"2024-10-14 09:45:24-04:00 41.9650 42.005 41.965 41.9975 3522.0\n",
|
||||
"2024-10-14 09:45:36-04:00 41.9900 42.005 41.990 42.0000 5960.0\n",
|
||||
"2024-10-14 09:45:48-04:00 42.0050 42.040 42.005 42.0300 9113.0\n",
|
||||
"... ... ... ... ... ...\n",
|
||||
"2024-10-16 15:00:00-04:00 42.9150 42.915 42.910 42.9100 12872.0\n",
|
||||
"2024-10-16 15:00:12-04:00 42.9150 42.920 42.910 42.9200 7574.0\n",
|
||||
"2024-10-16 15:00:24-04:00 42.9200 42.920 42.910 42.9200 1769.0\n",
|
||||
"2024-10-16 15:00:36-04:00 42.9200 42.920 42.905 42.9050 26599.0\n",
|
||||
"2024-10-16 15:00:48-04:00 42.9050 42.905 42.880 42.8800 9216.0\n",
|
||||
"\n",
|
||||
"[5480 rows x 5 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.ohlcv.data[symbol[0]]"
|
||||
]
|
||||
@ -478,26 +325,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"File: SPY-AggType.OHLCV-12-2024-01-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-True.parquet\n",
|
||||
"Coverage: 2024-01-15 09:30:00 to 2024-10-20 16:00:00\n",
|
||||
"Symbol: SPY\n",
|
||||
"Agg Type: AggType.OHLCV\n",
|
||||
"Resolution: 12\n",
|
||||
"Excludes: 4679BCFMOPUVWZ\n",
|
||||
"Minsize: 100\n",
|
||||
"Main Session Only: True\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from ttools.utils import list_matching_files, print_matching_files_info, zoneNY\n",
|
||||
"from datetime import datetime\n",
|
||||
@ -533,261 +363,16 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And date subset loaded from parquet. Usually this is all done yb `load_data` in loader."
|
||||
"From this file the subset of dates are loaded. Usually this is all done automatically by `load_data` in loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>open</th>\n",
|
||||
" <th>high</th>\n",
|
||||
" <th>low</th>\n",
|
||||
" <th>close</th>\n",
|
||||
" <th>volume</th>\n",
|
||||
" <th>trades</th>\n",
|
||||
" <th>updated</th>\n",
|
||||
" <th>vwap</th>\n",
|
||||
" <th>buyvolume</th>\n",
|
||||
" <th>sellvolume</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>time</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-01-16 09:30:00-05:00</th>\n",
|
||||
" <td>475.250</td>\n",
|
||||
" <td>475.3600</td>\n",
|
||||
" <td>475.20</td>\n",
|
||||
" <td>475.285</td>\n",
|
||||
" <td>255386.0</td>\n",
|
||||
" <td>93.0</td>\n",
|
||||
" <td>2024-01-16 09:30:01.002183-05:00</td>\n",
|
||||
" <td>475.251725</td>\n",
|
||||
" <td>3692.0</td>\n",
|
||||
" <td>242756.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-01-16 09:30:01-05:00</th>\n",
|
||||
" <td>475.335</td>\n",
|
||||
" <td>475.3350</td>\n",
|
||||
" <td>475.23</td>\n",
|
||||
" <td>475.260</td>\n",
|
||||
" <td>15161.0</td>\n",
|
||||
" <td>100.0</td>\n",
|
||||
" <td>2024-01-16 09:30:02.007313-05:00</td>\n",
|
||||
" <td>475.283390</td>\n",
|
||||
" <td>4386.0</td>\n",
|
||||
" <td>4944.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-01-16 09:30:02-05:00</th>\n",
|
||||
" <td>475.250</td>\n",
|
||||
" <td>475.3000</td>\n",
|
||||
" <td>475.24</td>\n",
|
||||
" <td>475.300</td>\n",
|
||||
" <td>6993.0</td>\n",
|
||||
" <td>39.0</td>\n",
|
||||
" <td>2024-01-16 09:30:03.008912-05:00</td>\n",
|
||||
" <td>475.262507</td>\n",
|
||||
" <td>1900.0</td>\n",
|
||||
" <td>2256.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-01-16 09:30:03-05:00</th>\n",
|
||||
" <td>475.290</td>\n",
|
||||
" <td>475.3200</td>\n",
|
||||
" <td>475.24</td>\n",
|
||||
" <td>475.270</td>\n",
|
||||
" <td>8497.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>2024-01-16 09:30:04.201093-05:00</td>\n",
|
||||
" <td>475.275280</td>\n",
|
||||
" <td>1300.0</td>\n",
|
||||
" <td>3200.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-01-16 09:30:04-05:00</th>\n",
|
||||
" <td>475.250</td>\n",
|
||||
" <td>475.2700</td>\n",
|
||||
" <td>475.22</td>\n",
|
||||
" <td>475.270</td>\n",
|
||||
" <td>5367.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>2024-01-16 09:30:05.004980-05:00</td>\n",
|
||||
" <td>475.234353</td>\n",
|
||||
" <td>1613.0</td>\n",
|
||||
" <td>1247.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-18 15:59:55-04:00</th>\n",
|
||||
" <td>584.520</td>\n",
|
||||
" <td>584.5800</td>\n",
|
||||
" <td>584.51</td>\n",
|
||||
" <td>584.580</td>\n",
|
||||
" <td>10357.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>2024-10-18 15:59:56.008928-04:00</td>\n",
|
||||
" <td>584.543870</td>\n",
|
||||
" <td>1600.0</td>\n",
|
||||
" <td>1100.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-18 15:59:56-04:00</th>\n",
|
||||
" <td>584.570</td>\n",
|
||||
" <td>584.6091</td>\n",
|
||||
" <td>584.55</td>\n",
|
||||
" <td>584.550</td>\n",
|
||||
" <td>6527.0</td>\n",
|
||||
" <td>32.0</td>\n",
|
||||
" <td>2024-10-18 15:59:57.007658-04:00</td>\n",
|
||||
" <td>584.566643</td>\n",
|
||||
" <td>1525.0</td>\n",
|
||||
" <td>1002.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-18 15:59:57-04:00</th>\n",
|
||||
" <td>584.560</td>\n",
|
||||
" <td>584.6100</td>\n",
|
||||
" <td>584.56</td>\n",
|
||||
" <td>584.600</td>\n",
|
||||
" <td>5068.0</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>2024-10-18 15:59:58.000435-04:00</td>\n",
|
||||
" <td>584.596249</td>\n",
|
||||
" <td>1960.0</td>\n",
|
||||
" <td>900.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-18 15:59:58-04:00</th>\n",
|
||||
" <td>584.590</td>\n",
|
||||
" <td>584.6200</td>\n",
|
||||
" <td>584.56</td>\n",
|
||||
" <td>584.560</td>\n",
|
||||
" <td>8786.0</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>2024-10-18 15:59:59.041984-04:00</td>\n",
|
||||
" <td>584.592217</td>\n",
|
||||
" <td>2859.0</td>\n",
|
||||
" <td>3921.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2024-10-18 15:59:59-04:00</th>\n",
|
||||
" <td>584.560</td>\n",
|
||||
" <td>584.6100</td>\n",
|
||||
" <td>584.56</td>\n",
|
||||
" <td>584.570</td>\n",
|
||||
" <td>12583.0</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>2024-10-18 15:59:59.982132-04:00</td>\n",
|
||||
" <td>584.583131</td>\n",
|
||||
" <td>5303.0</td>\n",
|
||||
" <td>1980.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>3384529 rows × 10 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" open high low close volume \\\n",
|
||||
"time \n",
|
||||
"2024-01-16 09:30:00-05:00 475.250 475.3600 475.20 475.285 255386.0 \n",
|
||||
"2024-01-16 09:30:01-05:00 475.335 475.3350 475.23 475.260 15161.0 \n",
|
||||
"2024-01-16 09:30:02-05:00 475.250 475.3000 475.24 475.300 6993.0 \n",
|
||||
"2024-01-16 09:30:03-05:00 475.290 475.3200 475.24 475.270 8497.0 \n",
|
||||
"2024-01-16 09:30:04-05:00 475.250 475.2700 475.22 475.270 5367.0 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"2024-10-18 15:59:55-04:00 584.520 584.5800 584.51 584.580 10357.0 \n",
|
||||
"2024-10-18 15:59:56-04:00 584.570 584.6091 584.55 584.550 6527.0 \n",
|
||||
"2024-10-18 15:59:57-04:00 584.560 584.6100 584.56 584.600 5068.0 \n",
|
||||
"2024-10-18 15:59:58-04:00 584.590 584.6200 584.56 584.560 8786.0 \n",
|
||||
"2024-10-18 15:59:59-04:00 584.560 584.6100 584.56 584.570 12583.0 \n",
|
||||
"\n",
|
||||
" trades updated \\\n",
|
||||
"time \n",
|
||||
"2024-01-16 09:30:00-05:00 93.0 2024-01-16 09:30:01.002183-05:00 \n",
|
||||
"2024-01-16 09:30:01-05:00 100.0 2024-01-16 09:30:02.007313-05:00 \n",
|
||||
"2024-01-16 09:30:02-05:00 39.0 2024-01-16 09:30:03.008912-05:00 \n",
|
||||
"2024-01-16 09:30:03-05:00 47.0 2024-01-16 09:30:04.201093-05:00 \n",
|
||||
"2024-01-16 09:30:04-05:00 37.0 2024-01-16 09:30:05.004980-05:00 \n",
|
||||
"... ... ... \n",
|
||||
"2024-10-18 15:59:55-04:00 47.0 2024-10-18 15:59:56.008928-04:00 \n",
|
||||
"2024-10-18 15:59:56-04:00 32.0 2024-10-18 15:59:57.007658-04:00 \n",
|
||||
"2024-10-18 15:59:57-04:00 23.0 2024-10-18 15:59:58.000435-04:00 \n",
|
||||
"2024-10-18 15:59:58-04:00 23.0 2024-10-18 15:59:59.041984-04:00 \n",
|
||||
"2024-10-18 15:59:59-04:00 69.0 2024-10-18 15:59:59.982132-04:00 \n",
|
||||
"\n",
|
||||
" vwap buyvolume sellvolume \n",
|
||||
"time \n",
|
||||
"2024-01-16 09:30:00-05:00 475.251725 3692.0 242756.0 \n",
|
||||
"2024-01-16 09:30:01-05:00 475.283390 4386.0 4944.0 \n",
|
||||
"2024-01-16 09:30:02-05:00 475.262507 1900.0 2256.0 \n",
|
||||
"2024-01-16 09:30:03-05:00 475.275280 1300.0 3200.0 \n",
|
||||
"2024-01-16 09:30:04-05:00 475.234353 1613.0 1247.0 \n",
|
||||
"... ... ... ... \n",
|
||||
"2024-10-18 15:59:55-04:00 584.543870 1600.0 1100.0 \n",
|
||||
"2024-10-18 15:59:56-04:00 584.566643 1525.0 1002.0 \n",
|
||||
"2024-10-18 15:59:57-04:00 584.596249 1960.0 900.0 \n",
|
||||
"2024-10-18 15:59:58-04:00 584.592217 2859.0 3921.0 \n",
|
||||
"2024-10-18 15:59:59-04:00 584.583131 5303.0 1980.0 \n",
|
||||
"\n",
|
||||
"[3384529 rows x 10 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#loading manually range subset from existing files\n",
|
||||
"start = zoneNY.localize(datetime(2024, 1, 15, 9, 30))\n",
|
||||
"end = zoneNY.localize(datetime(2024, 10, 20, 16, 00))\n",
|
||||
"\n",
|
||||
@ -800,6 +385,121 @@
|
||||
"\n",
|
||||
"ohlcv_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"from ttools.loaders import fetch_daily_stock_trades, fetch_trades_parallel\n",
|
||||
"from ttools.utils import zoneNY\n",
|
||||
"from datetime import datetime"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Fetching trades for whole range"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"SPY Contains 46 market days\n",
|
||||
"SPY All 46 split files loaded in 10.521624088287354 seconds\n",
|
||||
"Trimming 2024-01-16 09:30:00-05:00 2024-03-20 16:00:00-04:00\n",
|
||||
"excluding ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F', '9', 'M', '6']\n",
|
||||
"exclude done\n",
|
||||
"minsize 100\n",
|
||||
"minsize done\n",
|
||||
"SPY filtered\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"DatetimeIndex: 6513606 entries, 2024-01-16 09:30:00.001443-05:00 to 2024-03-20 15:59:59.992808-04:00\n",
|
||||
"Data columns (total 6 columns):\n",
|
||||
" # Column Dtype \n",
|
||||
"--- ------ ----- \n",
|
||||
" 0 x object \n",
|
||||
" 1 p float64\n",
|
||||
" 2 s int64 \n",
|
||||
" 3 i int64 \n",
|
||||
" 4 c object \n",
|
||||
" 5 z object \n",
|
||||
"dtypes: float64(1), int64(2), object(3)\n",
|
||||
"memory usage: 347.9+ MB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"#fethcing one day\n",
|
||||
"# df = fetch_daily_stock_trades(symbol=\"SPY\",\n",
|
||||
"# start=zoneNY.localize(datetime(2024, 1, 16, 9, 30)),\n",
|
||||
"# end=zoneNY.localize(datetime(2024, 1, 16, 16, 00)))\n",
|
||||
"# df.info()\n",
|
||||
"\n",
|
||||
"#fetching multiple days with parallel\n",
|
||||
"df = fetch_trades_parallel(symbol=\"SPY\",\n",
|
||||
" start_date=zoneNY.localize(datetime(2024, 1, 16, 9, 30)),\n",
|
||||
" end_date=zoneNY.localize(datetime(2024, 3, 20, 16, 00)))\n",
|
||||
"\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#comparing dataframes\n",
|
||||
"from ttools.utils import AGG_CACHE, compare_dataframes\n",
|
||||
"import pandas as pd\n",
|
||||
"file1 = AGG_CACHE / \"SPY-AggType.OHLCV-1-2024-02-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-False.parquet\"\n",
|
||||
"file2 = AGG_CACHE / \"SPY-AggType.OHLCV-1-2024-02-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-False_older2.parquet\"\n",
|
||||
"df1 = pd.read_parquet(file1)\n",
|
||||
"df2 = pd.read_parquet(file2)\n",
|
||||
"df1.equals(df2)\n",
|
||||
"\n",
|
||||
"#compare_dataframes(df1, df2)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@ -10,8 +10,80 @@ Includes fetch (remote/cached) methods and numba aggregator function for TIME BA
|
||||
|
||||
"""""
|
||||
|
||||
def aggregate_trades_optimized(symbol: str, trades_df: pd.DataFrame, resolution: int, type: AggType = AggType.OHLCV, clear_input: bool = False):
|
||||
"""
|
||||
Optimized version of trade aggregation function with reduced memory footprint.
|
||||
"""
|
||||
# 1. Get timestamps from index if 't' is not in columns
|
||||
if 't' not in trades_df.columns:
|
||||
timestamps = trades_df.index.values
|
||||
else:
|
||||
timestamps = trades_df['t'].values
|
||||
|
||||
# 2. Select only needed columns for prices and sizes
|
||||
prices = trades_df['p'].values
|
||||
sizes = trades_df['s'].values
|
||||
|
||||
#Clears input to freeup memory
|
||||
if clear_input:
|
||||
del trades_df
|
||||
|
||||
# 3. Convert timestamps maintaining exact precision
|
||||
# Convert directly to int64 nanoseconds, then to float seconds
|
||||
unix_timestamps_s = timestamps.view('int64').astype(np.float64) / 1e6
|
||||
#original not optimized, in case of issues (5x slower)
|
||||
#unix_timestamps_s = timestamps.astype('datetime64[ns]').astype(np.float64) / 1e9
|
||||
|
||||
# 4. Create ticks array efficiently
|
||||
# 3. Pre-allocate array for better memory efficiency
|
||||
ticks = np.empty((len(timestamps), 3), dtype=np.float64)
|
||||
ticks[:, 0] = unix_timestamps_s
|
||||
ticks[:, 1] = prices
|
||||
ticks[:, 2] = sizes
|
||||
|
||||
# 5. Clear memory of intermediate objects
|
||||
del timestamps, prices, sizes, unix_timestamps_s
|
||||
|
||||
# 6. Process based on type using existing pattern
|
||||
try:
|
||||
match type:
|
||||
case AggType.OHLCV:
|
||||
ohlcv_bars = generate_time_bars_nb(ticks, resolution)
|
||||
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
|
||||
'updated', 'vwap', 'buyvolume', 'sellvolume']
|
||||
case AggType.OHLCV_VOL:
|
||||
ohlcv_bars = generate_volume_bars_nb(ticks, resolution)
|
||||
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
|
||||
'updated', 'buyvolume', 'sellvolume']
|
||||
case AggType.OHLCV_DOL:
|
||||
ohlcv_bars = generate_dollar_bars_nb(ticks, resolution)
|
||||
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
|
||||
'amount', 'updated']
|
||||
case _:
|
||||
raise ValueError("Invalid AggType type. Supported types are 'time', 'volume' and 'dollar'.")
|
||||
finally:
|
||||
# 7. Clear large numpy array as soon as possible
|
||||
del ticks
|
||||
|
||||
# 8. Create DataFrame and handle timestamps - keeping original working approach
|
||||
ohlcv_df = pd.DataFrame(ohlcv_bars, columns=columns)
|
||||
del ohlcv_bars
|
||||
|
||||
# 9. Use the original timestamp handling that we know works
|
||||
ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s').dt.tz_localize('UTC').dt.tz_convert(zoneNY)
|
||||
ohlcv_df['updated'] = pd.to_datetime(ohlcv_df['updated'], unit="s").dt.tz_localize('UTC').dt.tz_convert(zoneNY)
|
||||
|
||||
# 10. Round microseconds as in original
|
||||
ohlcv_df['updated'] = ohlcv_df['updated'].dt.round('us')
|
||||
|
||||
# 11. Set index last, as in original
|
||||
ohlcv_df.set_index('time', inplace=True)
|
||||
|
||||
return ohlcv_df
|
||||
|
||||
def aggregate_trades(symbol: str, trades_df: pd.DataFrame, resolution: int, type: AggType = AggType.OHLCV):
|
||||
""""
|
||||
Original replaced by optimized version
|
||||
Accepts dataframe with trades keyed by symbol. Preparess dataframe to
|
||||
numpy and calls Numba optimized aggregator for given bar type. (time/volume/dollar)
|
||||
"""""
|
||||
|
||||
@ -17,8 +17,14 @@ from ttools.utils import AggType, fetch_calendar_data, print, print_matching_fil
|
||||
from tqdm import tqdm
|
||||
import threading
|
||||
from typing import List, Union
|
||||
from ttools.aggregator_vectorized import aggregate_trades
|
||||
|
||||
from ttools.aggregator_vectorized import aggregate_trades, aggregate_trades_optimized
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow.dataset as ds
|
||||
import pandas as pd
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import math
|
||||
import os
|
||||
"""
|
||||
Module for fetching stock data. Supports
|
||||
1) cache management
|
||||
@ -87,6 +93,8 @@ def convert_dict_to_multiindex_df(tradesResponse, rename_labels = True, keep_sym
|
||||
final_df.reset_index(inplace=True) # Reset index to remove MultiIndex levels, making them columns
|
||||
final_df.drop(columns=['symbol'], inplace=True) #remove symbol column
|
||||
final_df.set_index(timestamp_col, inplace=True) #reindex by timestamp
|
||||
#print index datetime resolution
|
||||
#print(final_df.index.dtype)
|
||||
|
||||
return final_df
|
||||
|
||||
@ -106,6 +114,28 @@ def filter_trade_df(df: pd.DataFrame, start: datetime = None, end: datetime = No
|
||||
Returns:
|
||||
df: pd.DataFrame
|
||||
"""
|
||||
def fast_filter(df, exclude_conditions):
    """Return *df* without rows whose 'c' condition codes contain any excluded code.

    Parameters
    ----------
    df : pd.DataFrame
        Trades frame with a 'c' column holding an iterable of condition
        codes per row.
    exclude_conditions : iterable of str
        Condition codes that disqualify a row.

    Returns
    -------
    pd.DataFrame
        The rows of *df* whose condition codes share no element with
        ``exclude_conditions``.
    """
    # Bug fix: the previous implementation joined each row's codes into one
    # comma-separated string and used substring matching, so a condition code
    # that is a substring of another code (or of the joined text) would
    # spuriously exclude the row.  Exact set membership avoids those false
    # positives and still does O(1) lookups per code.
    exclude_set = set(exclude_conditions)

    mask = np.array(
        [bool(exclude_set.intersection(codes)) for codes in df['c']],
        dtype=bool,
    )
    return df[~mask]
|
||||
|
||||
def vectorized_string_sets(df, exclude_conditions):
    """Drop rows whose 'c' condition codes intersect *exclude_conditions*.

    Builds the exclusion set once (O(1) membership tests), then keeps only
    the rows whose per-row condition array shares no element with it.
    """
    excluded = set(exclude_conditions)

    # One pass over the raw numpy values; True where the row survives.
    keep = np.array([not (excluded & set(codes)) for codes in df['c'].values])

    return df[keep]
|
||||
|
||||
# 9:30 to 16:00
|
||||
if main_session_only:
|
||||
|
||||
@ -120,30 +150,50 @@ def filter_trade_df(df: pd.DataFrame, start: datetime = None, end: datetime = No
|
||||
#REQUIRED FILTERING
|
||||
# Create a mask to filter rows within the specified time range
|
||||
if start is not None and end is not None:
|
||||
print(f"filtering {start.time()} {end.time()}")
|
||||
print(f"Trimming {start} {end}")
|
||||
if symbol_included:
|
||||
mask = (df.index.get_level_values('t') >= start) & \
|
||||
(df.index.get_level_values('t') <= end)
|
||||
df = df[mask]
|
||||
else:
|
||||
mask = (df.index >= start) & (df.index <= end)
|
||||
|
||||
# Apply the mask to the DataFrame
|
||||
df = df[mask]
|
||||
df = df.loc[start:end]
|
||||
|
||||
if exclude_conditions is not None:
|
||||
print(f"excluding {exclude_conditions}")
|
||||
# Create a mask to exclude rows with any of the specified conditions
|
||||
mask = df['c'].apply(lambda x: any(cond in exclude_conditions for cond in x))
|
||||
|
||||
# Filter out the rows with specified conditions
|
||||
df = df[~mask]
|
||||
df = vectorized_string_sets(df, exclude_conditions)
|
||||
print("exclude done")
|
||||
|
||||
if minsize is not None:
|
||||
print(f"minsize {minsize}")
|
||||
#exclude conditions
|
||||
df = df[df['s'] >= minsize]
|
||||
print("minsize done")
|
||||
return df
|
||||
|
||||
def calculate_optimal_workers(file_count, min_workers=4, max_workers=32):
    """
    Pick a worker count for loading ``file_count`` files in parallel.

    Heuristics:
    - at least ``min_workers`` so small batches still parallelize
    - at most ``max_workers`` to cap thread overhead
    - roughly one worker per 5 files
    - scale with CPU count (3x, since the work is I/O bound)
    """
    cpus = os.cpu_count() or 4

    # I/O bound work tolerates oversubscription: three threads per core.
    by_cpu = cpus * 3

    # One worker for every ~5 files keeps per-thread work meaningful.
    by_files = math.ceil(file_count / 5)

    # Use the more conservative of the two estimates...
    optimal = min(by_cpu, by_files)

    # ...then clamp it into the [min_workers, max_workers] range.
    return max(min_workers, min(optimal, max_workers))
|
||||
|
||||
def fetch_daily_stock_trades(symbol, start, end, exclude_conditions=None, minsize=None, main_session_only=True, no_return=False,force_remote=False, rename_labels = False, keep_symbols=False, max_retries=5, backoff_factor=1, data_feed: DataFeed = DataFeed.SIP, verbose = None):
|
||||
#doc for this function
|
||||
"""
|
||||
@ -281,7 +331,12 @@ def fetch_trades_parallel(symbol, start_date, end_date, exclude_conditions = EXC
|
||||
#speed it up , locals first and then fetches
|
||||
s_time = timetime()
|
||||
with trade_cache_lock:
|
||||
local_df = pd.concat([pd.read_parquet(f) for _,f in days_from_cache])
|
||||
file_paths = [f for _, f in days_from_cache]
|
||||
dataset = ds.dataset(file_paths, format='parquet')
|
||||
local_df = dataset.to_table().to_pandas()
|
||||
del dataset
|
||||
#original version
|
||||
#local_df = pd.concat([pd.read_parquet(f) for _,f in days_from_cache])
|
||||
final_time = timetime() - s_time
|
||||
print(f"{symbol} All {len(days_from_cache)} split files loaded in", final_time, "seconds")
|
||||
#the filter is required
|
||||
@ -413,7 +468,7 @@ def load_data(symbol: Union[str, List[str]],
|
||||
else:
|
||||
#neslo by zrychlit, kdyz se zobrazuje pomalu Searching cache - nejaky bottle neck?
|
||||
df = fetch_trades_parallel(symbol, start_date, end_date, minsize=minsize, exclude_conditions=exclude_conditions, main_session_only=main_session_only, force_remote=force_remote) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
|
||||
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=resolution, type=agg_type)
|
||||
ohlcv_df = aggregate_trades_optimized(symbol=symbol, trades_df=df, resolution=resolution, type=agg_type, clear_input = True)
|
||||
|
||||
ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow')
|
||||
print(f"{symbol} Saved to agg_cache", file_ohlcv)
|
||||
|
||||
145
ttools/utils.py
145
ttools/utils.py
@ -273,4 +273,147 @@ class StartBarAlign(str, Enum):
|
||||
RANDOM = first bar starts when first trade occurs
|
||||
"""
|
||||
ROUND = "round"
|
||||
RANDOM = "random"
|
||||
RANDOM = "random"
|
||||
|
||||
def compare_dataframes(df1, df2, name1="DataFrame 1", name2="DataFrame 2", check_dtype=True):
    """
    Compare two DataFrames and provide detailed analysis of their differences.

    Parameters:
    -----------
    df1, df2 : pandas.DataFrame
        The DataFrames to compare
    name1, name2 : str
        Names to identify the DataFrames in the output
    check_dtype : bool
        Whether to check if dtypes match for columns

    Returns:
    --------
    bool
        True if DataFrames are identical (based on check_dtype parameter)
    dict
        Detailed comparison results
    """
    results = {
        'are_equal': False,
        'shape_match': False,
        'column_match': False,
        'index_match': False,
        'dtype_match': False,
        'content_match': False,
        'differences': {}
    }

    # Shape comparison
    if df1.shape != df2.shape:
        results['differences']['shape'] = {
            name1: df1.shape,
            name2: df2.shape
        }
    else:
        results['shape_match'] = True

    # Column comparison (set-based: column order is not treated as a difference)
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    if cols1 != cols2:
        results['differences']['columns'] = {
            f'unique_to_{name1}': list(cols1 - cols2),
            f'unique_to_{name2}': list(cols2 - cols1),
            'common': list(cols1 & cols2)
        }
    else:
        results['column_match'] = True

    # Index comparison (set-based: duplicates/ordering are ignored here; an
    # ordering difference surfaces in the value comparison below)
    idx1 = set(df1.index)
    idx2 = set(df2.index)
    if idx1 != idx2:
        results['differences']['index'] = {
            f'unique_to_{name1}': list(idx1 - idx2),
            f'unique_to_{name2}': list(idx2 - idx1),
            'common': list(idx1 & idx2)
        }
    else:
        results['index_match'] = True

    # dtype comparison (only meaningful when the column sets match)
    if check_dtype and results['column_match']:
        dtype_diff = {}
        for col in cols1:
            if df1[col].dtype != df2[col].dtype:
                dtype_diff[col] = {
                    name1: str(df1[col].dtype),
                    name2: str(df2[col].dtype)
                }
        if dtype_diff:
            results['differences']['dtypes'] = dtype_diff
        else:
            results['dtype_match'] = True

    # Content comparison (only for matching columns and indices)
    if results['column_match'] and results['index_match']:
        value_diff = {}
        for col in cols1:
            s1 = df1[col]
            s2 = df2[col]
            # Fix: element-wise `!=` between identically-labeled but
            # differently-ordered Series raises in pandas; align first.
            if not s1.index.equals(s2.index):
                s2 = s2.reindex(s1.index)
            # Series.equals treats NaN == NaN, so use it as the fast path.
            if not s1.equals(s2):
                # Fix: plain `!=` reports NaN-vs-NaN positions as different,
                # contradicting the .equals() semantics above; mask them out.
                mask = (s1 != s2) & ~(s1.isna() & s2.isna())
                if mask.any():
                    diff_indices = s1.index[mask]
                    value_diff[col] = {
                        'different_at_indices': list(diff_indices),
                        'sample_differences': {
                            str(idx): {
                                name1: s1.loc[idx],
                                name2: s2.loc[idx]
                            } for idx in list(diff_indices)[:5]  # Show first 5 differences
                        }
                    }

        if value_diff:
            results['differences']['values'] = value_diff
        else:
            results['content_match'] = True

    # Overall equality
    results['are_equal'] = all([
        results['shape_match'],
        results['column_match'],
        results['index_match'],
        results['content_match'],
        (results['dtype_match'] if check_dtype else True)
    ])

    # Print summary
    print(f"\nComparison Summary of {name1} vs {name2}:")
    print(f"Shape Match: {results['shape_match']} ({df1.shape} vs {df2.shape})")
    print(f"Column Match: {results['column_match']}")
    print(f"Index Match: {results['index_match']}")
    print(f"Dtype Match: {results['dtype_match']}" if check_dtype else "Dtype Check: Skipped")
    print(f"Content Match: {results['content_match']}")
    print(f"\nOverall Equal: {results['are_equal']}")

    # Print detailed differences if any
    if not results['are_equal']:
        print("\nDetailed Differences:")
        for diff_type, diff_content in results['differences'].items():
            print(f"\n{diff_type.upper()}:")
            if diff_type == 'values':
                print(f"Number of columns with differences: {len(diff_content)}")
                for col, details in diff_content.items():
                    print(f"\nColumn '{col}':")
                    print(f"Number of different values: {len(details['different_at_indices'])}")
                    print("First few differences:")
                    for idx, vals in details['sample_differences'].items():
                        print(f"  At index {idx}:")
                        print(f"  {name1}: {vals[name1]}")
                        print(f"  {name2}: {vals[name2]}")
            else:
                print(diff_content)

    return results['are_equal'], results
|
||||
|
||||
Reference in New Issue
Block a user