# Things to try

TODO:

* better labeling
* continue here https://claude.ai/chat/b3ee78b6-9662-4f25-95f0-ecac4a78a41b
* try the model with other symbols
* try different retraining options (even hourly)

Features:

- add datetime features (useful for a rush-hour model; see the sketch after this list)
- add MT features as columns
- use convolutional networks to create features (https://www.youtube.com/watch?v=6wK4q8QvsV4)

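A minimal sketch of the datetime-features idea, assuming the bars have a `DatetimeIndex`; the cyclical encoding keeps 23:59 and 00:00 close together, which a raw hour column would not:

```python
import numpy as np
import pandas as pd

def add_datetime_features(features: pd.DataFrame) -> pd.DataFrame:
    """Add cyclical time-of-day and day-of-week columns (assumes DatetimeIndex)."""
    idx = features.index
    minute_of_day = idx.hour * 60 + idx.minute
    # Encode minute-of-day on a circle so midnight wraps smoothly
    features['tod_sin'] = np.sin(2 * np.pi * minute_of_day / 1440)
    features['tod_cos'] = np.cos(2 * np.pi * minute_of_day / 1440)
    features['day_of_week'] = idx.dayofweek  # 0 = Monday
    return features
```
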
Enhance model:

* multi-target training; see the xgboost docs (and the sketch after this list)
* use an SL with a target price, valid for a few seconds
* how to handle an imbalanced dataset: https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html

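A hedged sketch of the multi-target idea, assuming XGBoost >= 2.0 (where `multi_strategy="multi_output_tree"` is available); the data here is random placeholder, e.g. one target column per forward-return horizon:

```python
import numpy as np
from xgboost import XGBRegressor

X = np.random.rand(100, 10)  # placeholder feature matrix
y = np.random.rand(100, 3)   # placeholder: 3 targets, e.g. 3 return horizons

model = XGBRegressor(
    tree_method="hist",
    multi_strategy="multi_output_tree",  # one tree predicts all targets jointly
    n_estimators=200,
)
model.fit(X, y)
preds = model.predict(X)  # shape (100, 3)
```

The default (`one_output_per_tree`) trains separate trees per target; the joint strategy can exploit correlation between the horizons.
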
Target:

- maybe add manual labeling

# Features

```python
import numpy as np
import pandas as pd


def prepare_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
    """Prepare enhanced features from input df with focus on predictive potential"""
    features = pd.DataFrame(index=df.index)

    # Original OHLCV columns added to features
    features['close'] = df['close']
    features['volume'] = df['volume']
    features['trades_count'] = df['trades']
    features['buy_volume'] = df['buyvolume']
    features['sell_volume'] = df['sellvolume']
    features['high'] = df['high']
    features['low'] = df['low']
    # features['log_return'] = np.log(features['close'] / features['close'].shift(1))
    # features['returns_1'] = features['close'].pct_change()
    # features['returns_5'] = features['close'].pct_change(5)
    # features['returns_20'] = features['close'].pct_change(20)

    def get_fib_windows():
        """
        #TODO base the cap on real time (originally written for 1s bars,
        where the cap was 3600 and the sequence ran up to 2584)

        Generate Fibonacci window sizes up to one hour of 1-minute bars
        (3600 s / 60 s = 60 bars).
        Returns: 3, 5, 8, 13, 21, 34, 55
        """
        fib_windows = [3, 5]
        while fib_windows[-1] < 3600 / 60:
            next_fib = fib_windows[-1] + fib_windows[-2]
            if next_fib > 3600 / 60:
                break
            fib_windows.append(next_fib)
        return fib_windows

    fib_windows = get_fib_windows()

    # Base price and returns
    features['log_return'] = np.log(features['close'] / features['close'].shift(1))
    features['price_velocity'] = (features['close'] - features['close'].shift(1)) / 1.0  # per bar
    features['price_acceleration'] = features['price_velocity'] - features['price_velocity'].shift(1)

    # Fibonacci-based features
    for window in fib_windows:
        # Price features
        features[f'log_return_{window}s'] = np.log(features['close'] / features['close'].shift(window))
        features[f'volatility_{window}s'] = features['log_return'].rolling(window).std()
        features[f'range_{window}s'] = (features['high'].rolling(window).max() -
                                        features['low'].rolling(window).min()) / features['close']

        # Volume features
        features[f'volume_momentum_{window}s'] = (
            features['volume'].rolling(window).mean() /
            features['volume'].rolling(window * 2).mean()
        )

        features[f'buy_volume_momentum_{window}s'] = (
            features['buy_volume'].rolling(window).mean() /
            features['buy_volume'].rolling(window * 2).mean()
        )

        features[f'sell_volume_momentum_{window}s'] = (
            features['sell_volume'].rolling(window).mean() /
            features['sell_volume'].rolling(window * 2).mean()
        )

        # Trade features
        features[f'trade_intensity_{window}s'] = (
            features['trades_count'].rolling(window).mean() /
            features['trades_count'].rolling(window * 2).mean()
        )

        features[f'avg_trade_size_{window}s'] = (
            features['volume'].rolling(window).sum() /
            features['trades_count'].rolling(window).sum()
        )

        # Order flow features
        features[f'cum_volume_delta_{window}s'] = (
            features['buy_volume'] - features['sell_volume']
        ).rolling(window).sum()

        features[f'volume_pressure_{window}s'] = (
            features['buy_volume'].rolling(window).sum() /
            features['sell_volume'].rolling(window).sum()
        )

        # Price efficiency: net move relative to the high-low range traversed
        features[f'price_efficiency_{window}s'] = (
            np.abs(features['close'] - features['close'].shift(window)) /
            (features['high'].rolling(window).max() - features['low'].rolling(window).min())
        )

        # Moving averages and their crosses
        features[f'sma_{window}s'] = features['close'].rolling(window).mean()
        if window > 5:  # Create MA crosses against the shorter timeframe
            features[f'ma_cross_5_{window}s'] = (
                features['close'].rolling(5).mean() -
                features['close'].rolling(window).mean()
            )

    # MA-based features
    ma_lengths = [5, 10, 20, 50]
    for length in ma_lengths:
        # Regular MAs
        features[f'ma_{length}'] = features['close'].rolling(length).mean()

        # MA slopes (rate of change)
        features[f'ma_{length}_slope'] = features[f'ma_{length}'].pct_change(3)

        # Price distance from MA
        features[f'price_ma_{length}_dist'] = (features['close'] - features[f'ma_{length}']) / features[f'ma_{length}']

        # MA crossovers
        if length > 5:
            features[f'ma_5_{length}_cross'] = (features['ma_5'] - features[f'ma_{length}']) / features[f'ma_{length}']

    # MA convergence/divergence
    features['ma_convergence'] = ((features['ma_5'] - features['ma_20']).abs() /
                                  features['ma_20'].rolling(10).mean())

    # Volatility features using MAs
    features['ma_volatility'] = features['ma_5'].rolling(10).std() / features['ma_20']

    # MA momentum
    features['ma_momentum'] = (features['ma_5'] / features['ma_5'].shift(5) - 1) * 100

    # Cleanup and feature selection
    features = features.replace([np.inf, -np.inf], np.nan)

    lookback = 1000
    if len(features) > lookback:
        # Correlation computed over the most recent `lookback` rows only
        rolling_corr = features.iloc[-lookback:].corr().abs()
        upper = rolling_corr.where(np.triu(np.ones(rolling_corr.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        print(f"Highly correlated columns - maybe drop? {to_drop}")
        # features = features.drop(columns=to_drop)

    feature_columns = list(features.columns)
    print(f"Features shape before dropna: {features.shape}")

    return features.dropna(), feature_columns
```
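
For context, a hedged usage sketch; the column names (`close`, `high`, `low`, `volume`, `trades`, `buyvolume`, `sellvolume`) come from the method body above, while the file path and `FeatureBuilder` wrapper class are hypothetical:

```python
import pandas as pd

# Hypothetical: load 1-minute OHLCV bars with the columns the method expects
df = pd.read_parquet("bars_1m.parquet")

builder = FeatureBuilder()  # hypothetical class that holds prepare_features
features, feature_columns = builder.prepare_features(df)
print(features.shape, len(feature_columns))
```
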
# Targets
## Unbalanced classes

```python
from xgboost import XGBClassifier

# Compute scale_pos_weight = (negative count) / (positive count)
n_0 = (y_train == 0).sum()
n_1 = (y_train == 1).sum()
scale_pos_weight = n_0 / n_1

model = XGBClassifier(scale_pos_weight=scale_pos_weight, ...)
```
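
With, say, 900 negatives and 100 positives this gives `scale_pos_weight = 9`, so each positive example is weighted nine times as heavily in the loss, counteracting the imbalance.
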
```python
def create_target_regressor(self, df: pd.DataFrame) -> pd.Series:
    """
    https://claude.ai/chat/8e7fe81c-ddbe-4e64-9af0-2bc4764fc5f0

    Creates an enhanced target variable using adaptive returns based on market conditions.
    Key improvements:
    1. Multi-timeframe momentum approach
    2. Volume-volatility regime adaptation
    3. Trend-following vs mean-reversion regime detection
    4. Noise reduction through sophisticated filtering

    Parameters:
    -----------
    df : pd.DataFrame
        Features df containing required columns: 'close', 'volume', volatility features

    Returns:
    --------
    pd.Series
        Enhanced target variable with cross-day targets removed
    """
    future_bars = self.config.forward_bars

    # NOTE: computed but currently unused
    # future_ma_fast = df['close'].shift(-future_bars).rolling(5).mean()

    # Calculate forward returns (original approach)
    forward_returns = df['close'].shift(-future_bars) / df['close'] - 1
    target = forward_returns

    # Advanced noise reduction:
    # exponentially weighted standard deviation gives a dynamic threshold
    target_std = target.ewm(span=50, min_periods=20).std()

    # Adaptive thresholds based on the rolling standard deviation
    upper_clip = 2.5 * target_std
    lower_clip = -2.5 * target_std

    # Soft clipping via tanh: roughly linear for small returns,
    # smoothly saturating toward +/- target_std for large ones
    target = target_std * np.tanh(target / target_std)

    # Final hard clips for extreme outliers
    # (note: after the tanh squash |target| < target_std, so these bounds should not bind)
    target = target.clip(lower=lower_clip, upper=upper_clip)

    # Remove cross-day targets and intraday seasonality
    target = self.remove_crossday_targets(target, df, future_bars)

    # Keep only the top/bottom 10% of values; zero out the middle
    # target = target.where((target > target.quantile(0.9)) | (target < target.quantile(0.1)), 0)

    print("after target generation", target.index[[0, -1]])

    return target
```
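
`remove_crossday_targets` is called above but not shown in this note. A minimal sketch of what it might do, assuming a `DatetimeIndex` and that a target is invalid whenever its forward window crosses into the next session; this is a hypothetical reconstruction, not the actual method:

```python
import pandas as pd

def remove_crossday_targets(target: pd.Series, df: pd.DataFrame,
                            future_bars: int) -> pd.Series:
    """Hypothetical reconstruction: invalidate targets whose lookahead bar
    falls on a different calendar day (the forward window crosses a session
    boundary). Assumes df has a DatetimeIndex."""
    dates = pd.Series(df.index.date, index=df.index)
    future_dates = dates.shift(-future_bars)
    # Also True at the tail, where no future bar exists yet
    crosses_day = future_dates != dates
    return target.mask(crosses_day)
```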