v2realbot/vectorized_loader.ipynb at 031b2427b9c0a1b78e831379a21000beb7bd094a

Files

David Brazda 6b2a4bb066 update of vbt doc

2024-04-25 06:24:51 +02:00

22 KiB

Raw Blame History

In [37]:

import pandas as pd
import pyarrow
import numpy as np
from numba import jit
import v2realbot.utils.config_handler as cfh

Další info k pokračování je zde https://blog.quantinsti.com/tick-tick-ohlc-data-pandas-tutorial/

In [38]:

tdf=pd.read_parquet('trades_bac.parquet',engine='pyarrow')
#print(df)
df = tdf.loc['BAC']
df.info()
df

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    190261 non-null  object 
 1   price       190261 non-null  float64
 2   size        190261 non-null  float64
 3   id          190261 non-null  int64  
 4   conditions  190261 non-null  object 
 5   tape        190261 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 10.2+ MB

Out[38]:

	exchange	price	size	id	conditions	tape
timestamp
2024-04-22 13:30:00.267711+00:00	K	36.890	5.0	52983525037630	[ , F, I]	A
2024-04-22 13:30:00.300501+00:00	D	37.005	1.0	71675241117014	[ , I]	A
2024-04-22 13:30:00.305439+00:00	D	37.005	1.0	71675241117496	[ , I]	A
2024-04-22 13:30:00.314520+00:00	D	37.005	1.0	71675241118034	[ , I]	A
2024-04-22 13:30:00.335201+00:00	D	37.005	1.0	71675241121369	[ , I]	A
...	...	...	...	...	...	...
2024-04-22 19:59:59.902614+00:00	V	37.750	1100.0	56480705310575	[ ]	A
2024-04-22 19:59:59.977134+00:00	N	37.745	300.0	52983559963478	[ ]	A
2024-04-22 19:59:59.977137+00:00	N	37.740	7300.0	52983559963696	[ ]	A
2024-04-22 19:59:59.978626+00:00	V	37.750	16.0	56480706886228	[ , I]	A
2024-04-22 19:59:59.987614+00:00	N	37.745	30.0	52983559963958	[ , I]	A

190261 rows × 6 columns

In [39]:

@jit(nopython=True)
def ohlcv_bars(ticks, start_time, end_time, resolution):
    """
    Generate OHLCV bars from tick data, skipping intervals without trading activity.
    
    Parameters:
    - ticks: numpy array with columns [timestamp, price, size]
    - start_time: the start timestamp for bars (Unix timestamp)
    - end_time: the end timestamp for bars (Unix timestamp)
    - resolution: time resolution in seconds
    
    Returns:
    - OHLCV bars as a numpy array
    """
    num_bars = (end_time - start_time) // resolution + 1
    bar_list = []

    for i in range(num_bars):
        bar_start_time = start_time + i * resolution
        bar_end_time = bar_start_time + resolution
        bar_ticks = ticks[(ticks[:, 0] >= bar_start_time) & (ticks[:, 0] < bar_end_time)]
        
        if bar_ticks.shape[0] == 0:
            continue  # Skip this bar as there are no ticks

        # Calculate OHLCV values
        open_price = bar_ticks[0, 1]  # open
        high_price = np.max(bar_ticks[:, 1])  # high
        low_price = np.min(bar_ticks[:, 1])  # low
        close_price = bar_ticks[-1, 1]  # close
        volume = np.sum(bar_ticks[:, 2])  # volume
        bar_time = bar_start_time  # timestamp for the bar

        bar_list.append([open_price, high_price, low_price, close_price, volume, bar_time])

    # Convert list to numpy array
    if bar_list:
        ohlcv = np.array(bar_list)
    else:
        ohlcv = np.empty((0, 6))  # return an empty array if no bars were created

    return ohlcv

In [40]:

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    190261 non-null  object 
 1   price       190261 non-null  float64
 2   size        190261 non-null  float64
 3   id          190261 non-null  int64  
 4   conditions  190261 non-null  object 
 5   tape        190261 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 10.2+ MB

In [41]:

excludes = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES')
print(excludes)
#excludes = ["F", "I"]
# FILTER EXCLUDED TRADES
# Filter rows to exclude those where 'conditions' contains 'F' or 'I'
# This simplifies the logic by directly using ~ (bitwise not operator) with np.isin
df = df[~df['conditions'].apply(lambda x: np.isin(x, excludes).any())]

['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F']
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 143751 entries, 2024-04-22 13:30:00.300501+00:00 to 2024-04-22 19:59:59.987614+00:00
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   exchange    143751 non-null  object 
 1   price       143751 non-null  float64
 2   size        143751 non-null  float64
 3   id          143751 non-null  int64  
 4   conditions  143751 non-null  object 
 5   tape        143751 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 7.7+ MB

Out[41]:

	exchange	price	size	id	conditions	tape
timestamp
2024-04-22 13:30:00.300501+00:00	D	37.005	1.0	71675241117014	[ , I]	A
2024-04-22 13:30:00.305439+00:00	D	37.005	1.0	71675241117496	[ , I]	A
2024-04-22 13:30:00.314520+00:00	D	37.005	1.0	71675241118034	[ , I]	A
2024-04-22 13:30:00.335201+00:00	D	37.005	1.0	71675241121369	[ , I]	A
2024-04-22 13:30:00.346219+00:00	D	37.005	1.0	71675241122389	[ , I]	A
...	...	...	...	...	...	...
2024-04-22 19:59:59.902614+00:00	V	37.750	1100.0	56480705310575	[ ]	A
2024-04-22 19:59:59.977134+00:00	N	37.745	300.0	52983559963478	[ ]	A
2024-04-22 19:59:59.977137+00:00	N	37.740	7300.0	52983559963696	[ ]	A
2024-04-22 19:59:59.978626+00:00	V	37.750	16.0	56480706886228	[ , I]	A
2024-04-22 19:59:59.987614+00:00	N	37.745	30.0	52983559963958	[ , I]	A

143751 rows × 6 columns

In [46]:

# Creating a structured array with the timestamp as the first element
structured_array = np.array(list(zip(df.index, df['price'], df['size'])),
                            dtype=[('timestamp', 'datetime64[ns]'), ('price', 'float'), ('size', 'float')])

print(structured_array)
structured_array

# ticks = df[['index', 'price', 'size']].to_numpy()
# # ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000  # 
# ticks

/var/folders/8p/dwqnp65s0s77jdbm4_6z4vp80000gn/T/ipykernel_52602/3341929382.py:2: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future
  structured_array = np.array(list(zip(df.index, df['price'], df['size'])),

[('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00)
 ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00)
 ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00) ...
 ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03)
 ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01)
 ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)]

Out[46]:

array([('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00),
       ('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00),
       ('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00), ...,
       ('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03),
       ('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01),
       ('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)],
      dtype=[('timestamp', '<M8[ns]'), ('price', '<f8'), ('size', '<f8')])

In [ ]:

resolution_seconds = 1  # 1 second resolution
ohlcv_data = ohlcv_bars(structured_array, resolution_seconds)

# Converting the result back to DataFrame for better usability
ohlcv_df = pd.DataFrame(ohlcv_data, columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Time'])
ohlcv_df['Time'] = pd.to_datetime(ohlcv_df['Time'], unit='s')  # Convert timestamps back to datetime

22 KiB Raw Blame History Unescape Escape

22 KiB

Raw Blame History