Optimizations

This commit is contained in:
David Brazda
2024-11-01 11:18:10 +01:00
parent c3faa53eff
commit 2116679dba
5 changed files with 491 additions and 521 deletions

View File

@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(
name='ttools',
version='0.6.4',
version='0.7.0',
packages=find_packages(),
install_requires=[
# list your dependencies here

View File

@ -40,7 +40,7 @@
"from ttools.utils import AggType\n",
"from datetime import datetime\n",
"from ttools.aggregator_vectorized import generate_time_bars_nb, aggregate_trades\n",
"from ttools.loaders import load_data, prepare_trade_cache\n",
"from ttools.loaders import load_data, prepare_trade_cache, fetch_daily_stock_trades\n",
"from ttools.utils import zoneNY\n",
"import vectorbtpro as vbt\n",
"from lightweight_charts import PlotDFAccessor, PlotSRAccessor\n",
@ -69,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -110,44 +110,44 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-02-15 09:30:00-05:00</th>\n",
" <td>499.29</td>\n",
" <td>499.41</td>\n",
" <td>499.2900</td>\n",
" <td>499.3200</td>\n",
" <td>161900.0</td>\n",
" <th>2024-09-16 04:01:24-04:00</th>\n",
" <td>562.22</td>\n",
" <td>562.22</td>\n",
" <td>562.22</td>\n",
" <td>562.22</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-02-15 09:30:01-05:00</th>\n",
" <td>499.32</td>\n",
" <td>499.41</td>\n",
" <td>499.3000</td>\n",
" <td>499.4000</td>\n",
" <td>10900.0</td>\n",
" <th>2024-09-16 04:02:24-04:00</th>\n",
" <td>562.17</td>\n",
" <td>562.17</td>\n",
" <td>562.17</td>\n",
" <td>562.17</td>\n",
" <td>293.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-02-15 09:30:02-05:00</th>\n",
" <td>499.36</td>\n",
" <td>499.40</td>\n",
" <td>499.3550</td>\n",
" <td>499.3800</td>\n",
" <td>7040.0</td>\n",
" <th>2024-09-16 04:04:36-04:00</th>\n",
" <td>562.54</td>\n",
" <td>562.54</td>\n",
" <td>562.54</td>\n",
" <td>562.54</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-02-15 09:30:03-05:00</th>\n",
" <td>499.39</td>\n",
" <td>499.42</td>\n",
" <td>499.3800</td>\n",
" <td>499.4000</td>\n",
" <td>8717.0</td>\n",
" <th>2024-09-16 04:10:00-04:00</th>\n",
" <td>562.39</td>\n",
" <td>562.39</td>\n",
" <td>562.39</td>\n",
" <td>562.39</td>\n",
" <td>102.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-02-15 09:30:04-05:00</th>\n",
" <td>499.40</td>\n",
" <td>499.40</td>\n",
" <td>499.3500</td>\n",
" <td>499.3500</td>\n",
" <td>3265.0</td>\n",
" <th>2024-09-16 04:10:24-04:00</th>\n",
" <td>562.44</td>\n",
" <td>562.44</td>\n",
" <td>562.44</td>\n",
" <td>562.44</td>\n",
" <td>371.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
@ -158,69 +158,69 @@
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-03-18 15:59:55-04:00</th>\n",
" <td>512.94</td>\n",
" <td>512.94</td>\n",
" <td>512.8600</td>\n",
" <td>512.8900</td>\n",
" <td>7345.0</td>\n",
" <th>2024-10-18 19:57:24-04:00</th>\n",
" <td>584.80</td>\n",
" <td>584.80</td>\n",
" <td>584.80</td>\n",
" <td>584.80</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-03-18 15:59:56-04:00</th>\n",
" <td>512.90</td>\n",
" <td>512.90</td>\n",
" <td>512.8700</td>\n",
" <td>512.8800</td>\n",
" <td>2551.0</td>\n",
" <th>2024-10-18 19:57:48-04:00</th>\n",
" <td>584.84</td>\n",
" <td>584.84</td>\n",
" <td>584.84</td>\n",
" <td>584.84</td>\n",
" <td>622.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-03-18 15:59:57-04:00</th>\n",
" <td>512.89</td>\n",
" <td>512.91</td>\n",
" <td>512.8500</td>\n",
" <td>512.8701</td>\n",
" <td>18063.0</td>\n",
" <th>2024-10-18 19:58:48-04:00</th>\n",
" <td>584.77</td>\n",
" <td>584.79</td>\n",
" <td>584.77</td>\n",
" <td>584.79</td>\n",
" <td>4158.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-03-18 15:59:58-04:00</th>\n",
" <td>512.87</td>\n",
" <td>512.90</td>\n",
" <td>512.8496</td>\n",
" <td>512.9000</td>\n",
" <td>7734.0</td>\n",
" <th>2024-10-18 19:59:36-04:00</th>\n",
" <td>584.80</td>\n",
" <td>584.82</td>\n",
" <td>584.80</td>\n",
" <td>584.82</td>\n",
" <td>298.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-03-18 15:59:59-04:00</th>\n",
" <td>512.92</td>\n",
" <td>512.92</td>\n",
" <td>512.8200</td>\n",
" <td>512.8700</td>\n",
" <td>37159.0</td>\n",
" <th>2024-10-18 19:59:48-04:00</th>\n",
" <td>584.76</td>\n",
" <td>584.76</td>\n",
" <td>584.72</td>\n",
" <td>584.72</td>\n",
" <td>258.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>417345 rows × 5 columns</p>\n",
"<p>64218 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" open high low close volume\n",
"time \n",
"2024-02-15 09:30:00-05:00 499.29 499.41 499.2900 499.3200 161900.0\n",
"2024-02-15 09:30:01-05:00 499.32 499.41 499.3000 499.4000 10900.0\n",
"2024-02-15 09:30:02-05:00 499.36 499.40 499.3550 499.3800 7040.0\n",
"2024-02-15 09:30:03-05:00 499.39 499.42 499.3800 499.4000 8717.0\n",
"2024-02-15 09:30:04-05:00 499.40 499.40 499.3500 499.3500 3265.0\n",
"... ... ... ... ... ...\n",
"2024-03-18 15:59:55-04:00 512.94 512.94 512.8600 512.8900 7345.0\n",
"2024-03-18 15:59:56-04:00 512.90 512.90 512.8700 512.8800 2551.0\n",
"2024-03-18 15:59:57-04:00 512.89 512.91 512.8500 512.8701 18063.0\n",
"2024-03-18 15:59:58-04:00 512.87 512.90 512.8496 512.9000 7734.0\n",
"2024-03-18 15:59:59-04:00 512.92 512.92 512.8200 512.8700 37159.0\n",
" open high low close volume\n",
"time \n",
"2024-09-16 04:01:24-04:00 562.22 562.22 562.22 562.22 200.0\n",
"2024-09-16 04:02:24-04:00 562.17 562.17 562.17 562.17 293.0\n",
"2024-09-16 04:04:36-04:00 562.54 562.54 562.54 562.54 100.0\n",
"2024-09-16 04:10:00-04:00 562.39 562.39 562.39 562.39 102.0\n",
"2024-09-16 04:10:24-04:00 562.44 562.44 562.44 562.44 371.0\n",
"... ... ... ... ... ...\n",
"2024-10-18 19:57:24-04:00 584.80 584.80 584.80 584.80 100.0\n",
"2024-10-18 19:57:48-04:00 584.84 584.84 584.84 584.84 622.0\n",
"2024-10-18 19:58:48-04:00 584.77 584.79 584.77 584.79 4158.0\n",
"2024-10-18 19:59:36-04:00 584.80 584.82 584.80 584.82 298.0\n",
"2024-10-18 19:59:48-04:00 584.76 584.76 584.72 584.72 258.0\n",
"\n",
"[417345 rows x 5 columns]"
"[64218 rows x 5 columns]"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -229,17 +229,17 @@
"#This is how to call LOAD function\n",
"symbol = [\"SPY\"]\n",
"#datetime in zoneNY \n",
"day_start = datetime(2024, 2, 15, 9, 30, 0)\n",
"day_stop = datetime(2024, 3, 18, 16, 0, 0)\n",
"day_start = datetime(2024, 9, 15, 9, 30, 0)\n",
"day_stop = datetime(2024, 10, 20, 16, 0, 0)\n",
"day_start = zoneNY.localize(day_start)\n",
"day_stop = zoneNY.localize(day_stop)\n",
"\n",
"#requested AGG\n",
"resolution = 1 #12s bars\n",
"resolution = 12 #12s bars\n",
"agg_type = AggType.OHLCV #other types AggType.OHLCV_VOL, AggType.OHLCV_DOL, AggType.OHLCV_RENKO\n",
"exclude_conditions = ['C','O','4','B','7','V','P','W','U','Z','F','9','M','6'] #None to defaults\n",
"minsize = 100 #min trade size to include\n",
"main_session_only = True\n",
"main_session_only = False\n",
"force_remote = False\n",
"\n",
"data = load_data(symbol = symbol,\n",
@ -260,162 +260,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>open</th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" <th>close</th>\n",
" <th>volume</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-10-14 09:45:00-04:00</th>\n",
" <td>41.9650</td>\n",
" <td>41.970</td>\n",
" <td>41.950</td>\n",
" <td>41.9500</td>\n",
" <td>17895.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-14 09:45:12-04:00</th>\n",
" <td>41.9589</td>\n",
" <td>41.965</td>\n",
" <td>41.950</td>\n",
" <td>41.9650</td>\n",
" <td>6281.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-14 09:45:24-04:00</th>\n",
" <td>41.9650</td>\n",
" <td>42.005</td>\n",
" <td>41.965</td>\n",
" <td>41.9975</td>\n",
" <td>3522.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-14 09:45:36-04:00</th>\n",
" <td>41.9900</td>\n",
" <td>42.005</td>\n",
" <td>41.990</td>\n",
" <td>42.0000</td>\n",
" <td>5960.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-14 09:45:48-04:00</th>\n",
" <td>42.0050</td>\n",
" <td>42.040</td>\n",
" <td>42.005</td>\n",
" <td>42.0300</td>\n",
" <td>9113.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-16 15:00:00-04:00</th>\n",
" <td>42.9150</td>\n",
" <td>42.915</td>\n",
" <td>42.910</td>\n",
" <td>42.9100</td>\n",
" <td>12872.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-16 15:00:12-04:00</th>\n",
" <td>42.9150</td>\n",
" <td>42.920</td>\n",
" <td>42.910</td>\n",
" <td>42.9200</td>\n",
" <td>7574.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-16 15:00:24-04:00</th>\n",
" <td>42.9200</td>\n",
" <td>42.920</td>\n",
" <td>42.910</td>\n",
" <td>42.9200</td>\n",
" <td>1769.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-16 15:00:36-04:00</th>\n",
" <td>42.9200</td>\n",
" <td>42.920</td>\n",
" <td>42.905</td>\n",
" <td>42.9050</td>\n",
" <td>26599.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-16 15:00:48-04:00</th>\n",
" <td>42.9050</td>\n",
" <td>42.905</td>\n",
" <td>42.880</td>\n",
" <td>42.8800</td>\n",
" <td>9216.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5480 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" open high low close volume\n",
"time \n",
"2024-10-14 09:45:00-04:00 41.9650 41.970 41.950 41.9500 17895.0\n",
"2024-10-14 09:45:12-04:00 41.9589 41.965 41.950 41.9650 6281.0\n",
"2024-10-14 09:45:24-04:00 41.9650 42.005 41.965 41.9975 3522.0\n",
"2024-10-14 09:45:36-04:00 41.9900 42.005 41.990 42.0000 5960.0\n",
"2024-10-14 09:45:48-04:00 42.0050 42.040 42.005 42.0300 9113.0\n",
"... ... ... ... ... ...\n",
"2024-10-16 15:00:00-04:00 42.9150 42.915 42.910 42.9100 12872.0\n",
"2024-10-16 15:00:12-04:00 42.9150 42.920 42.910 42.9200 7574.0\n",
"2024-10-16 15:00:24-04:00 42.9200 42.920 42.910 42.9200 1769.0\n",
"2024-10-16 15:00:36-04:00 42.9200 42.920 42.905 42.9050 26599.0\n",
"2024-10-16 15:00:48-04:00 42.9050 42.905 42.880 42.8800 9216.0\n",
"\n",
"[5480 rows x 5 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"data.ohlcv.data[symbol[0]]"
]
@ -478,26 +325,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"File: SPY-AggType.OHLCV-12-2024-01-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-True.parquet\n",
"Coverage: 2024-01-15 09:30:00 to 2024-10-20 16:00:00\n",
"Symbol: SPY\n",
"Agg Type: AggType.OHLCV\n",
"Resolution: 12\n",
"Excludes: 4679BCFMOPUVWZ\n",
"Minsize: 100\n",
"Main Session Only: True\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"outputs": [],
"source": [
"from ttools.utils import list_matching_files, print_matching_files_info, zoneNY\n",
"from datetime import datetime\n",
@ -533,261 +363,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"And date subset loaded from parquet. Usually this is all done yb `load_data` in loader."
"From this file the subset of dates are loaded. Usually this is all done automatically by `load_data` in loader."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>open</th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" <th>close</th>\n",
" <th>volume</th>\n",
" <th>trades</th>\n",
" <th>updated</th>\n",
" <th>vwap</th>\n",
" <th>buyvolume</th>\n",
" <th>sellvolume</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-01-16 09:30:00-05:00</th>\n",
" <td>475.250</td>\n",
" <td>475.3600</td>\n",
" <td>475.20</td>\n",
" <td>475.285</td>\n",
" <td>255386.0</td>\n",
" <td>93.0</td>\n",
" <td>2024-01-16 09:30:01.002183-05:00</td>\n",
" <td>475.251725</td>\n",
" <td>3692.0</td>\n",
" <td>242756.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-01-16 09:30:01-05:00</th>\n",
" <td>475.335</td>\n",
" <td>475.3350</td>\n",
" <td>475.23</td>\n",
" <td>475.260</td>\n",
" <td>15161.0</td>\n",
" <td>100.0</td>\n",
" <td>2024-01-16 09:30:02.007313-05:00</td>\n",
" <td>475.283390</td>\n",
" <td>4386.0</td>\n",
" <td>4944.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-01-16 09:30:02-05:00</th>\n",
" <td>475.250</td>\n",
" <td>475.3000</td>\n",
" <td>475.24</td>\n",
" <td>475.300</td>\n",
" <td>6993.0</td>\n",
" <td>39.0</td>\n",
" <td>2024-01-16 09:30:03.008912-05:00</td>\n",
" <td>475.262507</td>\n",
" <td>1900.0</td>\n",
" <td>2256.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-01-16 09:30:03-05:00</th>\n",
" <td>475.290</td>\n",
" <td>475.3200</td>\n",
" <td>475.24</td>\n",
" <td>475.270</td>\n",
" <td>8497.0</td>\n",
" <td>47.0</td>\n",
" <td>2024-01-16 09:30:04.201093-05:00</td>\n",
" <td>475.275280</td>\n",
" <td>1300.0</td>\n",
" <td>3200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-01-16 09:30:04-05:00</th>\n",
" <td>475.250</td>\n",
" <td>475.2700</td>\n",
" <td>475.22</td>\n",
" <td>475.270</td>\n",
" <td>5367.0</td>\n",
" <td>37.0</td>\n",
" <td>2024-01-16 09:30:05.004980-05:00</td>\n",
" <td>475.234353</td>\n",
" <td>1613.0</td>\n",
" <td>1247.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-18 15:59:55-04:00</th>\n",
" <td>584.520</td>\n",
" <td>584.5800</td>\n",
" <td>584.51</td>\n",
" <td>584.580</td>\n",
" <td>10357.0</td>\n",
" <td>47.0</td>\n",
" <td>2024-10-18 15:59:56.008928-04:00</td>\n",
" <td>584.543870</td>\n",
" <td>1600.0</td>\n",
" <td>1100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-18 15:59:56-04:00</th>\n",
" <td>584.570</td>\n",
" <td>584.6091</td>\n",
" <td>584.55</td>\n",
" <td>584.550</td>\n",
" <td>6527.0</td>\n",
" <td>32.0</td>\n",
" <td>2024-10-18 15:59:57.007658-04:00</td>\n",
" <td>584.566643</td>\n",
" <td>1525.0</td>\n",
" <td>1002.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-18 15:59:57-04:00</th>\n",
" <td>584.560</td>\n",
" <td>584.6100</td>\n",
" <td>584.56</td>\n",
" <td>584.600</td>\n",
" <td>5068.0</td>\n",
" <td>23.0</td>\n",
" <td>2024-10-18 15:59:58.000435-04:00</td>\n",
" <td>584.596249</td>\n",
" <td>1960.0</td>\n",
" <td>900.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-18 15:59:58-04:00</th>\n",
" <td>584.590</td>\n",
" <td>584.6200</td>\n",
" <td>584.56</td>\n",
" <td>584.560</td>\n",
" <td>8786.0</td>\n",
" <td>23.0</td>\n",
" <td>2024-10-18 15:59:59.041984-04:00</td>\n",
" <td>584.592217</td>\n",
" <td>2859.0</td>\n",
" <td>3921.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-10-18 15:59:59-04:00</th>\n",
" <td>584.560</td>\n",
" <td>584.6100</td>\n",
" <td>584.56</td>\n",
" <td>584.570</td>\n",
" <td>12583.0</td>\n",
" <td>69.0</td>\n",
" <td>2024-10-18 15:59:59.982132-04:00</td>\n",
" <td>584.583131</td>\n",
" <td>5303.0</td>\n",
" <td>1980.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3384529 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" open high low close volume \\\n",
"time \n",
"2024-01-16 09:30:00-05:00 475.250 475.3600 475.20 475.285 255386.0 \n",
"2024-01-16 09:30:01-05:00 475.335 475.3350 475.23 475.260 15161.0 \n",
"2024-01-16 09:30:02-05:00 475.250 475.3000 475.24 475.300 6993.0 \n",
"2024-01-16 09:30:03-05:00 475.290 475.3200 475.24 475.270 8497.0 \n",
"2024-01-16 09:30:04-05:00 475.250 475.2700 475.22 475.270 5367.0 \n",
"... ... ... ... ... ... \n",
"2024-10-18 15:59:55-04:00 584.520 584.5800 584.51 584.580 10357.0 \n",
"2024-10-18 15:59:56-04:00 584.570 584.6091 584.55 584.550 6527.0 \n",
"2024-10-18 15:59:57-04:00 584.560 584.6100 584.56 584.600 5068.0 \n",
"2024-10-18 15:59:58-04:00 584.590 584.6200 584.56 584.560 8786.0 \n",
"2024-10-18 15:59:59-04:00 584.560 584.6100 584.56 584.570 12583.0 \n",
"\n",
" trades updated \\\n",
"time \n",
"2024-01-16 09:30:00-05:00 93.0 2024-01-16 09:30:01.002183-05:00 \n",
"2024-01-16 09:30:01-05:00 100.0 2024-01-16 09:30:02.007313-05:00 \n",
"2024-01-16 09:30:02-05:00 39.0 2024-01-16 09:30:03.008912-05:00 \n",
"2024-01-16 09:30:03-05:00 47.0 2024-01-16 09:30:04.201093-05:00 \n",
"2024-01-16 09:30:04-05:00 37.0 2024-01-16 09:30:05.004980-05:00 \n",
"... ... ... \n",
"2024-10-18 15:59:55-04:00 47.0 2024-10-18 15:59:56.008928-04:00 \n",
"2024-10-18 15:59:56-04:00 32.0 2024-10-18 15:59:57.007658-04:00 \n",
"2024-10-18 15:59:57-04:00 23.0 2024-10-18 15:59:58.000435-04:00 \n",
"2024-10-18 15:59:58-04:00 23.0 2024-10-18 15:59:59.041984-04:00 \n",
"2024-10-18 15:59:59-04:00 69.0 2024-10-18 15:59:59.982132-04:00 \n",
"\n",
" vwap buyvolume sellvolume \n",
"time \n",
"2024-01-16 09:30:00-05:00 475.251725 3692.0 242756.0 \n",
"2024-01-16 09:30:01-05:00 475.283390 4386.0 4944.0 \n",
"2024-01-16 09:30:02-05:00 475.262507 1900.0 2256.0 \n",
"2024-01-16 09:30:03-05:00 475.275280 1300.0 3200.0 \n",
"2024-01-16 09:30:04-05:00 475.234353 1613.0 1247.0 \n",
"... ... ... ... \n",
"2024-10-18 15:59:55-04:00 584.543870 1600.0 1100.0 \n",
"2024-10-18 15:59:56-04:00 584.566643 1525.0 1002.0 \n",
"2024-10-18 15:59:57-04:00 584.596249 1960.0 900.0 \n",
"2024-10-18 15:59:58-04:00 584.592217 2859.0 3921.0 \n",
"2024-10-18 15:59:59-04:00 584.583131 5303.0 1980.0 \n",
"\n",
"[3384529 rows x 10 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"#loading manually range subset from existing files\n",
"start = zoneNY.localize(datetime(2024, 1, 15, 9, 30))\n",
"end = zoneNY.localize(datetime(2024, 10, 20, 16, 00))\n",
"\n",
@ -800,6 +385,121 @@
"\n",
"ohlcv_df"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TTOOLS: Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env\n"
]
}
],
"source": [
"\n",
"from ttools.loaders import fetch_daily_stock_trades, fetch_trades_parallel\n",
"from ttools.utils import zoneNY\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fetching trades for whole range"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SPY Contains 46 market days\n",
"SPY All 46 split files loaded in 10.521624088287354 seconds\n",
"Trimming 2024-01-16 09:30:00-05:00 2024-03-20 16:00:00-04:00\n",
"excluding ['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F', '9', 'M', '6']\n",
"exclude done\n",
"minsize 100\n",
"minsize done\n",
"SPY filtered\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 6513606 entries, 2024-01-16 09:30:00.001443-05:00 to 2024-03-20 15:59:59.992808-04:00\n",
"Data columns (total 6 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 x object \n",
" 1 p float64\n",
" 2 s int64 \n",
" 3 i int64 \n",
" 4 c object \n",
" 5 z object \n",
"dtypes: float64(1), int64(2), object(3)\n",
"memory usage: 347.9+ MB\n"
]
}
],
"source": [
"\n",
"\n",
"#fethcing one day\n",
"# df = fetch_daily_stock_trades(symbol=\"SPY\",\n",
"# start=zoneNY.localize(datetime(2024, 1, 16, 9, 30)),\n",
"# end=zoneNY.localize(datetime(2024, 1, 16, 16, 00)))\n",
"# df.info()\n",
"\n",
"#fetching multiple days with parallel\n",
"df = fetch_trades_parallel(symbol=\"SPY\",\n",
" start_date=zoneNY.localize(datetime(2024, 1, 16, 9, 30)),\n",
" end_date=zoneNY.localize(datetime(2024, 3, 20, 16, 00)))\n",
"\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#comparing dataframes\n",
"from ttools.utils import AGG_CACHE, compare_dataframes\n",
"import pandas as pd\n",
"file1 = AGG_CACHE / \"SPY-AggType.OHLCV-1-2024-02-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-False.parquet\"\n",
"file2 = AGG_CACHE / \"SPY-AggType.OHLCV-1-2024-02-15T09-30-00-2024-10-20T16-00-00-4679BCFMOPUVWZ-100-False_older2.parquet\"\n",
"df1 = pd.read_parquet(file1)\n",
"df2 = pd.read_parquet(file2)\n",
"df1.equals(df2)\n",
"\n",
"#compare_dataframes(df1, df2)"
]
}
],
"metadata": {

View File

@ -10,8 +10,80 @@ Includes fetch (remote/cached) methods and numba aggregator function for TIME BA
"""""
def aggregate_trades_optimized(symbol: str, trades_df: pd.DataFrame, resolution: int, type: AggType = AggType.OHLCV, clear_input: bool = False):
    """
    Optimized version of the trade aggregation function with a reduced memory footprint.
    """
    # 1. Get timestamps from the index if 't' is not in columns
    if 't' not in trades_df.columns:
        timestamps = trades_df.index.values
    else:
        timestamps = trades_df['t'].values

    # 2. Select only the needed columns for prices and sizes
    prices = trades_df['p'].values
    sizes = trades_df['s'].values

    # Clear the input to free up memory
    if clear_input:
        del trades_df

    # 3. Convert timestamps to float seconds while maintaining exact precision.
    # The int64 view is in the index's own unit; dividing by 1e6 assumes a
    # microsecond-resolution index (as loaded from parquet).
    unix_timestamps_s = timestamps.view('int64').astype(np.float64) / 1e6
    # original, non-optimized conversion kept in case of issues (about 5x slower)
    #unix_timestamps_s = timestamps.astype('datetime64[ns]').astype(np.float64) / 1e9

    # 4. Create the ticks array efficiently: pre-allocate for better memory use
    ticks = np.empty((len(timestamps), 3), dtype=np.float64)
    ticks[:, 0] = unix_timestamps_s
    ticks[:, 1] = prices
    ticks[:, 2] = sizes

    # 5. Clear intermediate objects
    del timestamps, prices, sizes, unix_timestamps_s

    # 6. Process based on type, using the existing pattern
    try:
        match type:
            case AggType.OHLCV:
                ohlcv_bars = generate_time_bars_nb(ticks, resolution)
                columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
                           'updated', 'vwap', 'buyvolume', 'sellvolume']
            case AggType.OHLCV_VOL:
                ohlcv_bars = generate_volume_bars_nb(ticks, resolution)
                columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
                           'updated', 'buyvolume', 'sellvolume']
            case AggType.OHLCV_DOL:
                ohlcv_bars = generate_dollar_bars_nb(ticks, resolution)
                columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades',
                           'amount', 'updated']
            case _:
                raise ValueError("Invalid AggType. Supported types are 'time', 'volume' and 'dollar'.")
    finally:
        # 7. Clear the large numpy array as soon as possible
        del ticks

    # 8. Create the DataFrame, keeping the original working approach to timestamps
    ohlcv_df = pd.DataFrame(ohlcv_bars, columns=columns)
    del ohlcv_bars

    # 9. Use the original timestamp handling that is known to work
    ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s').dt.tz_localize('UTC').dt.tz_convert(zoneNY)
    ohlcv_df['updated'] = pd.to_datetime(ohlcv_df['updated'], unit="s").dt.tz_localize('UTC').dt.tz_convert(zoneNY)

    # 10. Round microseconds as in the original
    ohlcv_df['updated'] = ohlcv_df['updated'].dt.round('us')

    # 11. Set the index last, as in the original
    ohlcv_df.set_index('time', inplace=True)
    return ohlcv_df
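
# Quick sanity check of the divisor choice above, as a sketch (assumes pandas 2.x,
# where the index resolution is explicit; the helper name is illustrative): the
# int64 view is in the index's own unit, so the divisor must match that unit.
def _divisor_sanity_check():
    idx_us = pd.DatetimeIndex(['2024-01-16 09:30:00']).as_unit('us')
    idx_ns = pd.DatetimeIndex(['2024-01-16 09:30:00']).as_unit('ns')
    secs_us = idx_us.values.view('int64')[0] / 1e6  # microseconds -> seconds
    secs_ns = idx_ns.values.view('int64')[0] / 1e9  # nanoseconds -> seconds
    assert secs_us == secs_ns == 1705397400.0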
def aggregate_trades(symbol: str, trades_df: pd.DataFrame, resolution: int, type: AggType = AggType.OHLCV):
    """
    Original version, replaced by the optimized variant above.
    Accepts a dataframe with trades keyed by symbol. Prepares the dataframe as
    numpy arrays and calls the Numba-optimized aggregator for the given bar type (time/volume/dollar).
    """

View File

@ -17,8 +17,14 @@ from ttools.utils import AggType, fetch_calendar_data, print, print_matching_fil
from tqdm import tqdm
import threading
from typing import List, Union
from ttools.aggregator_vectorized import aggregate_trades
from ttools.aggregator_vectorized import aggregate_trades, aggregate_trades_optimized
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
from concurrent.futures import ThreadPoolExecutor
import math
import os
"""
Module for fetching stock data. Supports
1) cache management
@ -87,6 +93,8 @@ def convert_dict_to_multiindex_df(tradesResponse, rename_labels = True, keep_sym
    final_df.reset_index(inplace=True)  # Reset index to remove MultiIndex levels, making them columns
    final_df.drop(columns=['symbol'], inplace=True)  # remove symbol column
    final_df.set_index(timestamp_col, inplace=True)  # reindex by timestamp
    # print index datetime resolution
    # print(final_df.index.dtype)
    return final_df
@ -106,6 +114,28 @@ def filter_trade_df(df: pd.DataFrame, start: datetime = None, end: datetime = No
    Returns:
        df: pd.DataFrame
    """
    def fast_filter(df, exclude_conditions):
        # Convert condition arrays to strings once
        str_series = df['c'].apply(lambda x: ','.join(x))
        # Create mask using vectorized string operations
        mask = np.zeros(len(df), dtype=bool)
        for cond in exclude_conditions:
            mask |= str_series.str.contains(cond, regex=False)
        # Apply filter
        return df[~mask]

    def vectorized_string_sets(df, exclude_conditions):
        # Convert exclude_conditions to a set for O(1) lookup
        exclude_set = set(exclude_conditions)
        # Vectorized operation using set intersection
        arrays = df['c'].values
        mask = np.array([bool(set(arr) & exclude_set) for arr in arrays])
        return df[~mask]

    # 9:30 to 16:00
    if main_session_only:
@ -120,30 +150,50 @@ def filter_trade_df(df: pd.DataFrame, start: datetime = None, end: datetime = No
    # REQUIRED FILTERING
    # Create a mask to filter rows within the specified time range
    if start is not None and end is not None:
        print(f"filtering {start.time()} {end.time()}")
        print(f"Trimming {start} {end}")
        if symbol_included:
            mask = (df.index.get_level_values('t') >= start) & \
                   (df.index.get_level_values('t') <= end)
            df = df[mask]
        else:
            mask = (df.index >= start) & (df.index <= end)
            # Apply the mask to the DataFrame
            df = df[mask]
            df = df.loc[start:end]
    if exclude_conditions is not None:
        print(f"excluding {exclude_conditions}")
        # Create a mask to exclude rows with any of the specified conditions
        mask = df['c'].apply(lambda x: any(cond in exclude_conditions for cond in x))
        # Filter out the rows with specified conditions
        df = df[~mask]
        df = vectorized_string_sets(df, exclude_conditions)
        print("exclude done")
    if minsize is not None:
        print(f"minsize {minsize}")
        # exclude trades below the minimum size
        df = df[df['s'] >= minsize]
        print("minsize done")
    return df
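
# The two helper filters above differ subtly: vectorized_string_sets tests exact
# membership of condition codes, while fast_filter's substring match can over-match
# when codes share characters. A toy sketch of the set-based behavior (values are
# made up; the helper name is illustrative):
def _condition_filter_sketch():
    trades = pd.DataFrame({
        'p': [500.00, 500.10, 500.20],         # price
        's': [100, 50, 200],                   # size
        'c': [['@'], ['@', 'O'], ['F', 'T']],  # per-trade condition codes
    })
    exclude_set = {'O', 'F'}
    mask = np.array([bool(set(arr) & exclude_set) for arr in trades['c'].values])
    return trades[~mask]  # keeps only the first row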
def calculate_optimal_workers(file_count, min_workers=4, max_workers=32):
    """
    Calculate the optimal number of workers based on file count and system resources.

    Rules of thumb:
    - Minimum of 4 workers to ensure parallelization
    - Maximum of 32 workers to avoid thread overhead
    - For 100 files, aim for around 16-24 workers
    - Scale with CPU count but don't exceed max_workers
    """
    cpu_count = os.cpu_count() or 4
    # Base calculation: 2-4x CPU count for I/O-bound tasks
    suggested_workers = cpu_count * 3
    # Scale based on file count (1 worker per 4-6 files is a good ratio)
    files_based_workers = math.ceil(file_count / 5)
    # Take the smaller of the two suggestions
    optimal_workers = min(suggested_workers, files_based_workers)
    # Clamp between min and max workers
    return max(min_workers, min(optimal_workers, max_workers))
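
# Worked examples of the sizing logic above (CPU counts are hypothetical):
#   8 CPUs, 100 files:  min(8*3, ceil(100/5)) = min(24, 20) -> 20
#   8 CPUs, 10 files:   min(24, ceil(10/5))   = 2  -> clamped up to min_workers = 4
#   16 CPUs, 500 files: min(48, ceil(500/5))  = 48 -> clamped down to max_workers = 32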
def fetch_daily_stock_trades(symbol, start, end, exclude_conditions=None, minsize=None, main_session_only=True, no_return=False, force_remote=False, rename_labels=False, keep_symbols=False, max_retries=5, backoff_factor=1, data_feed: DataFeed = DataFeed.SIP, verbose=None):
    """
@ -281,7 +331,12 @@ def fetch_trades_parallel(symbol, start_date, end_date, exclude_conditions = EXC
    # speed it up: locals first, then fetches
    s_time = timetime()
    with trade_cache_lock:
        local_df = pd.concat([pd.read_parquet(f) for _,f in days_from_cache])
        file_paths = [f for _, f in days_from_cache]
        dataset = ds.dataset(file_paths, format='parquet')
        local_df = dataset.to_table().to_pandas()
        del dataset
        # original version
        #local_df = pd.concat([pd.read_parquet(f) for _,f in days_from_cache])
    final_time = timetime() - s_time
    print(f"{symbol} All {len(days_from_cache)} split files loaded in", final_time, "seconds")
    # the filter is required
@ -413,7 +468,7 @@ def load_data(symbol: Union[str, List[str]],
    else:
        # could this be sped up? "Searching cache" displays slowly - is there a bottleneck?
        df = fetch_trades_parallel(symbol, start_date, end_date, minsize=minsize, exclude_conditions=exclude_conditions, main_session_only=main_session_only, force_remote=force_remote) #exclude_conditions=['C','O','4','B','7','V','P','W','U','Z','F'])
        ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=resolution, type=agg_type)
        ohlcv_df = aggregate_trades_optimized(symbol=symbol, trades_df=df, resolution=resolution, type=agg_type, clear_input=True)
        ohlcv_df.to_parquet(file_ohlcv, engine='pyarrow')
        print(f"{symbol} Saved to agg_cache", file_ohlcv)

View File

@ -273,4 +273,147 @@ class StartBarAlign(str, Enum):
RANDOM = first bar starts when first trade occurs
"""
ROUND = "round"
RANDOM = "random"
RANDOM = "random"
def compare_dataframes(df1, df2, name1="DataFrame 1", name2="DataFrame 2", check_dtype=True):
    """
    Compare two DataFrames and provide a detailed analysis of their differences.

    Parameters:
    -----------
    df1, df2 : pandas.DataFrame
        The DataFrames to compare
    name1, name2 : str
        Names to identify the DataFrames in the output
    check_dtype : bool
        Whether to check if dtypes match for columns

    Returns:
    --------
    bool
        True if the DataFrames are identical (based on the check_dtype parameter)
    dict
        Detailed comparison results
    """
    results = {
        'are_equal': False,
        'shape_match': False,
        'column_match': False,
        'index_match': False,
        'dtype_match': False,
        'content_match': False,
        'differences': {}
    }

    # Shape comparison
    if df1.shape != df2.shape:
        results['differences']['shape'] = {
            name1: df1.shape,
            name2: df2.shape
        }
    else:
        results['shape_match'] = True

    # Column comparison
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    if cols1 != cols2:
        results['differences']['columns'] = {
            f'unique_to_{name1}': list(cols1 - cols2),
            f'unique_to_{name2}': list(cols2 - cols1),
            'common': list(cols1 & cols2)
        }
    else:
        results['column_match'] = True

    # Index comparison
    idx1 = set(df1.index)
    idx2 = set(df2.index)
    if idx1 != idx2:
        results['differences']['index'] = {
            f'unique_to_{name1}': list(idx1 - idx2),
            f'unique_to_{name2}': list(idx2 - idx1),
            'common': list(idx1 & idx2)
        }
    else:
        results['index_match'] = True

    # dtype comparison
    if check_dtype and results['column_match']:
        dtype_diff = {}
        for col in cols1:
            if df1[col].dtype != df2[col].dtype:
                dtype_diff[col] = {
                    name1: str(df1[col].dtype),
                    name2: str(df2[col].dtype)
                }
        if dtype_diff:
            results['differences']['dtypes'] = dtype_diff
        else:
            results['dtype_match'] = True

    # Content comparison (only for matching columns and indices)
    if results['column_match'] and results['index_match']:
        common_cols = list(cols1)
        value_diff = {}
        for col in common_cols:
            # Compare values
            if not df1[col].equals(df2[col]):
                # Find specific differences
                mask = df1[col] != df2[col]
                if mask.any():
                    diff_indices = df1.index[mask]
                    value_diff[col] = {
                        'different_at_indices': list(diff_indices),
                        'sample_differences': {
                            str(idx): {
                                name1: df1.loc[idx, col],
                                name2: df2.loc[idx, col]
                            } for idx in list(diff_indices)[:5]  # Show first 5 differences
                        }
                    }
        if value_diff:
            results['differences']['values'] = value_diff
        else:
            results['content_match'] = True

    # Overall equality
    results['are_equal'] = all([
        results['shape_match'],
        results['column_match'],
        results['index_match'],
        results['content_match'],
        (results['dtype_match'] if check_dtype else True)
    ])

    # Print summary
    print(f"\nComparison Summary of {name1} vs {name2}:")
    print(f"Shape Match: {results['shape_match']} ({df1.shape} vs {df2.shape})")
    print(f"Column Match: {results['column_match']}")
    print(f"Index Match: {results['index_match']}")
    print(f"Dtype Match: {results['dtype_match']}" if check_dtype else "Dtype Check: Skipped")
    print(f"Content Match: {results['content_match']}")
    print(f"\nOverall Equal: {results['are_equal']}")

    # Print detailed differences if any
    if not results['are_equal']:
        print("\nDetailed Differences:")
        for diff_type, diff_content in results['differences'].items():
            print(f"\n{diff_type.upper()}:")
            if diff_type == 'values':
                print(f"Number of columns with differences: {len(diff_content)}")
                for col, details in diff_content.items():
                    print(f"\nColumn '{col}':")
                    print(f"Number of different values: {len(details['different_at_indices'])}")
                    print("First few differences:")
                    for idx, vals in details['sample_differences'].items():
                        print(f"  At index {idx}:")
                        print(f"    {name1}: {vals[name1]}")
                        print(f"    {name2}: {vals[name2]}")
            else:
                print(diff_content)

    return results['are_equal'], results
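
# Minimal usage sketch (toy frames; compare_dataframes returns the
# (are_equal, results) tuple defined above; the helper name is illustrative):
def _compare_dataframes_sketch():
    import pandas as pd
    df_a = pd.DataFrame({'open': [1.0, 2.0], 'close': [1.5, 2.5]})
    df_b = pd.DataFrame({'open': [1.0, 2.0], 'close': [1.5, 9.9]})
    are_equal, results = compare_dataframes(df_a, df_b, name1="cached", name2="fresh")
    # prints a summary; results['differences']['values'] pinpoints the 'close' mismatch
    return are_equal, results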