Files
v2realbot/research/vectorized_loader.ipynb
2024-04-25 06:24:51 +02:00

22 KiB
Raw Blame History

In [37]:
import pandas as pd
import pyarrow
import numpy as np
from numba import jit
import v2realbot.utils.config_handler as cfh
In [38]:
tdf=pd.read_parquet('trades_bac.parquet',engine='pyarrow')
#print(df)
df = tdf.loc['BAC']
df.info()
df
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    190261 non-null  object 
 1   price       190261 non-null  float64
 2   size        190261 non-null  float64
 3   id          190261 non-null  int64  
 4   conditions  190261 non-null  object 
 5   tape        190261 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 10.2+ MB
Out[38]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
timestamp
2024-04-22 13:30:00.267711+00:00 K 36.890 5.0 52983525037630 [ , F, I] A
2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 [ , I] A
2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 [ , I] A
2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 [ , I] A
2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 [ , I] A
... ... ... ... ... ... ...
2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 [ ] A
2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 [ ] A
2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 [ ] A
2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 [ , I] A
2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 [ , I] A

190261 rows × 6 columns

In [39]:
@jit(nopython=True)
def ohlcv_bars(ticks, start_time, end_time, resolution):
    """
    Generate OHLCV bars from tick data, skipping intervals without trading activity.
    
    Parameters:
    - ticks: numpy array with columns [timestamp, price, size]
    - start_time: the start timestamp for bars (Unix timestamp)
    - end_time: the end timestamp for bars (Unix timestamp)
    - resolution: time resolution in seconds
    
    Returns:
    - OHLCV bars as a numpy array
    """
    num_bars = (end_time - start_time) // resolution + 1
    bar_list = []

    for i in range(num_bars):
        bar_start_time = start_time + i * resolution
        bar_end_time = bar_start_time + resolution
        bar_ticks = ticks[(ticks[:, 0] >= bar_start_time) & (ticks[:, 0] < bar_end_time)]
        
        if bar_ticks.shape[0] == 0:
            continue  # Skip this bar as there are no ticks

        # Calculate OHLCV values
        open_price = bar_ticks[0, 1]  # open
        high_price = np.max(bar_ticks[:, 1])  # high
        low_price = np.min(bar_ticks[:, 1])  # low
        close_price = bar_ticks[-1, 1]  # close
        volume = np.sum(bar_ticks[:, 2])  # volume
        bar_time = bar_start_time  # timestamp for the bar

        bar_list.append([open_price, high_price, low_price, close_price, volume, bar_time])

    # Convert list to numpy array
    if bar_list:
        ohlcv = np.array(bar_list)
    else:
        ohlcv = np.empty((0, 6))  # return an empty array if no bars were created

    return ohlcv
In [40]:
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    190261 non-null  object 
 1   price       190261 non-null  float64
 2   size        190261 non-null  float64
 3   id          190261 non-null  int64  
 4   conditions  190261 non-null  object 
 5   tape        190261 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 10.2+ MB
In [41]:
excludes = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES')
print(excludes)
#excludes = ["F", "I"]
# FILTER EXCLUDED TRADES
# Filter rows to exclude those where 'conditions' contains 'F' or 'I'
# This simplifies the logic by directly using ~ (bitwise not operator) with np.isin
df = df[~df['conditions'].apply(lambda x: np.isin(x, excludes).any())]
['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 143751 entries, 2024-04-22 13:30:00.300501+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    143751 non-null  object 
 1   price       143751 non-null  float64
 2   size        143751 non-null  float64
 3   id          143751 non-null  int64  
 4   conditions  143751 non-null  object 
 5   tape        143751 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 7.7+ MB
Out[41]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
exchange price size id conditions tape
timestamp
2024-04-22 13:30:00.300501+00:00 D 37.005 1.0 71675241117014 [ , I] A
2024-04-22 13:30:00.305439+00:00 D 37.005 1.0 71675241117496 [ , I] A
2024-04-22 13:30:00.314520+00:00 D 37.005 1.0 71675241118034 [ , I] A
2024-04-22 13:30:00.335201+00:00 D 37.005 1.0 71675241121369 [ , I] A
2024-04-22 13:30:00.346219+00:00 D 37.005 1.0 71675241122389 [ , I] A
... ... ... ... ... ... ...
2024-04-22 19:59:59.902614+00:00 V 37.750 1100.0 56480705310575 [ ] A
2024-04-22 19:59:59.977134+00:00 N 37.745 300.0 52983559963478 [ ] A
2024-04-22 19:59:59.977137+00:00 N 37.740 7300.0 52983559963696 [ ] A
2024-04-22 19:59:59.978626+00:00 V 37.750 16.0 56480706886228 [ , I] A
2024-04-22 19:59:59.987614+00:00 N 37.745 30.0 52983559963958 [ , I] A

143751 rows × 6 columns

In [46]:
# Creating a structured array with the timestamp as the first element
structured_array = np.array(list(zip(df.index, df['price'], df['size'])),
                            dtype=[('timestamp', 'datetime64[ns]'), ('price', 'float'), ('size', 'float')])

print(structured_array)
structured_array

# ticks = df[['index', 'price', 'size']].to_numpy()
# # ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000  # 
# ticks
/var/folders/8p/dwqnp65s0s77jdbm4_6z4vp80000gn/T/ipykernel_52602/3341929382.py:2: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future
  structured_array = np.array(list(zip(df.index, df['price'], df['size'])),
[('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00)
 ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00)
 ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00) ...
 ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03)
 ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01)
 ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)]
Out[46]:
array([('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00),
       ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00),
       ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00), ...,
       ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03),
       ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01),
       ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)],
      dtype=[('timestamp', '<M8[ns]'), ('price', '<f8'), ('size', '<f8')])
In [ ]:
resolution_seconds = 1  # 1 second resolution
ohlcv_data = ohlcv_bars(structured_array, resolution_seconds)

# Converting the result back to DataFrame for better usability
ohlcv_df = pd.DataFrame(ohlcv_data, columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Time'])
ohlcv_df['Time'] = pd.to_datetime(ohlcv_df['Time'], unit='s')  # Convert timestamps back to datetime