22 KiB
22 KiB
In [37]:
import pandas as pd import pyarrow import numpy as np from numba import jit import v2realbot.utils.config_handler as cfh
Další info k pokračování je zde https://blog.quantinsti.com/tick-tick-ohlc-data-pandas-tutorial/
In [38]:
tdf=pd.read_parquet('trades_bac.parquet',engine='pyarrow') #print(df) df = tdf.loc['BAC'] df.info() df
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 exchange 190261 non-null object 1 price 190261 non-null float64 2 size 190261 non-null float64 3 id 190261 non-null int64 4 conditions 190261 non-null object 5 tape 190261 non-null object dtypes: float64(2), int64(1), object(3) memory usage: 10.2+ MB
Out[38]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | |
|---|---|---|---|---|---|---|
| timestamp | ||||||
| 2024-04-22 13:30:00.267711+00:00 | K | 36.890 | 5.0 | 52983525037630 | [ , F, I] | A |
| 2024-04-22 13:30:00.300501+00:00 | D | 37.005 | 1.0 | 71675241117014 | [ , I] | A |
| 2024-04-22 13:30:00.305439+00:00 | D | 37.005 | 1.0 | 71675241117496 | [ , I] | A |
| 2024-04-22 13:30:00.314520+00:00 | D | 37.005 | 1.0 | 71675241118034 | [ , I] | A |
| 2024-04-22 13:30:00.335201+00:00 | D | 37.005 | 1.0 | 71675241121369 | [ , I] | A |
| ... | ... | ... | ... | ... | ... | ... |
| 2024-04-22 19:59:59.902614+00:00 | V | 37.750 | 1100.0 | 56480705310575 | [ ] | A |
| 2024-04-22 19:59:59.977134+00:00 | N | 37.745 | 300.0 | 52983559963478 | [ ] | A |
| 2024-04-22 19:59:59.977137+00:00 | N | 37.740 | 7300.0 | 52983559963696 | [ ] | A |
| 2024-04-22 19:59:59.978626+00:00 | V | 37.750 | 16.0 | 56480706886228 | [ , I] | A |
| 2024-04-22 19:59:59.987614+00:00 | N | 37.745 | 30.0 | 52983559963958 | [ , I] | A |
190261 rows × 6 columns
In [39]:
@jit(nopython=True) def ohlcv_bars(ticks, start_time, end_time, resolution): """ Generate OHLCV bars from tick data, skipping intervals without trading activity. Parameters: - ticks: numpy array with columns [timestamp, price, size] - start_time: the start timestamp for bars (Unix timestamp) - end_time: the end timestamp for bars (Unix timestamp) - resolution: time resolution in seconds Returns: - OHLCV bars as a numpy array """ num_bars = (end_time - start_time) // resolution + 1 bar_list = [] for i in range(num_bars): bar_start_time = start_time + i * resolution bar_end_time = bar_start_time + resolution bar_ticks = ticks[(ticks[:, 0] >= bar_start_time) & (ticks[:, 0] < bar_end_time)] if bar_ticks.shape[0] == 0: continue # Skip this bar as there are no ticks # Calculate OHLCV values open_price = bar_ticks[0, 1] # open high_price = np.max(bar_ticks[:, 1]) # high low_price = np.min(bar_ticks[:, 1]) # low close_price = bar_ticks[-1, 1] # close volume = np.sum(bar_ticks[:, 2]) # volume bar_time = bar_start_time # timestamp for the bar bar_list.append([open_price, high_price, low_price, close_price, volume, bar_time]) # Convert list to numpy array if bar_list: ohlcv = np.array(bar_list) else: ohlcv = np.empty((0, 6)) # return an empty array if no bars were created return ohlcv
In [40]:
df.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 190261 entries, 2024-04-22 13:30:00.267711+00:00 to 2024-04-22 19:59:59.987614+00:00 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 exchange 190261 non-null object 1 price 190261 non-null float64 2 size 190261 non-null float64 3 id 190261 non-null int64 4 conditions 190261 non-null object 5 tape 190261 non-null object dtypes: float64(2), int64(1), object(3) memory usage: 10.2+ MB
In [41]:
excludes = cfh.config_handler.get_val('AGG_EXCLUDED_TRADES') print(excludes) #excludes = ["F", "I"] # FILTER EXCLUDED TRADES # Filter rows to exclude those where 'conditions' contains 'F' or 'I' # This simplifies the logic by directly using ~ (bitwise not operator) with np.isin df = df[~df['conditions'].apply(lambda x: np.isin(x, excludes).any())]
['C', 'O', '4', 'B', '7', 'V', 'P', 'W', 'U', 'Z', 'F'] <class 'pandas.core.frame.DataFrame'> DatetimeIndex: 143751 entries, 2024-04-22 13:30:00.300501+00:00 to 2024-04-22 19:59:59.987614+00:00 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 exchange 143751 non-null object 1 price 143751 non-null float64 2 size 143751 non-null float64 3 id 143751 non-null int64 4 conditions 143751 non-null object 5 tape 143751 non-null object dtypes: float64(2), int64(1), object(3) memory usage: 7.7+ MB
Out[41]:
<style scoped="">
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| exchange | price | size | id | conditions | tape | |
|---|---|---|---|---|---|---|
| timestamp | ||||||
| 2024-04-22 13:30:00.300501+00:00 | D | 37.005 | 1.0 | 71675241117014 | [ , I] | A |
| 2024-04-22 13:30:00.305439+00:00 | D | 37.005 | 1.0 | 71675241117496 | [ , I] | A |
| 2024-04-22 13:30:00.314520+00:00 | D | 37.005 | 1.0 | 71675241118034 | [ , I] | A |
| 2024-04-22 13:30:00.335201+00:00 | D | 37.005 | 1.0 | 71675241121369 | [ , I] | A |
| 2024-04-22 13:30:00.346219+00:00 | D | 37.005 | 1.0 | 71675241122389 | [ , I] | A |
| ... | ... | ... | ... | ... | ... | ... |
| 2024-04-22 19:59:59.902614+00:00 | V | 37.750 | 1100.0 | 56480705310575 | [ ] | A |
| 2024-04-22 19:59:59.977134+00:00 | N | 37.745 | 300.0 | 52983559963478 | [ ] | A |
| 2024-04-22 19:59:59.977137+00:00 | N | 37.740 | 7300.0 | 52983559963696 | [ ] | A |
| 2024-04-22 19:59:59.978626+00:00 | V | 37.750 | 16.0 | 56480706886228 | [ , I] | A |
| 2024-04-22 19:59:59.987614+00:00 | N | 37.745 | 30.0 | 52983559963958 | [ , I] | A |
143751 rows × 6 columns
In [46]:
# Creating a structured array with the timestamp as the first element structured_array = np.array(list(zip(df.index, df['price'], df['size'])), dtype=[('timestamp', 'datetime64[ns]'), ('price', 'float'), ('size', 'float')]) print(structured_array) structured_array # ticks = df[['index', 'price', 'size']].to_numpy() # # ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000 # # ticks
/var/folders/8p/dwqnp65s0s77jdbm4_6z4vp80000gn/T/ipykernel_52602/3341929382.py:2: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future structured_array = np.array(list(zip(df.index, df['price'], df['size'])),
[('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00)
('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00)
('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00) ...
('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03)
('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01)
('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)]
Out[46]:
array([('2024-04-22T13:30:00.300501000', 37.005, 1.0e+00),
('2024-04-22T13:30:00.305439000', 37.005, 1.0e+00),
('2024-04-22T13:30:00.314520000', 37.005, 1.0e+00), ...,
('2024-04-22T19:59:59.977137000', 37.74 , 7.3e+03),
('2024-04-22T19:59:59.978626000', 37.75 , 1.6e+01),
('2024-04-22T19:59:59.987614000', 37.745, 3.0e+01)],
dtype=[('timestamp', '<M8[ns]'), ('price', '<f8'), ('size', '<f8')])
In [ ]:
resolution_seconds = 1 # 1 second resolution ohlcv_data = ohlcv_bars(structured_array, resolution_seconds) # Converting the result back to DataFrame for better usability ohlcv_df = pd.DataFrame(ohlcv_data, columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Time']) ohlcv_df['Time'] = pd.to_datetime(ohlcv_df['Time'], unit='s') # Convert timestamps back to datetime