Commit after half a year

features_targets.md (new file, 243 lines)

# Things to try

TODO:
* better labeling
* continue here https://claude.ai/chat/b3ee78b6-9662-4f25-95f0-ecac4a78a41b
* try model with other symbols
* try different retraining options (even hourly) - see the walk-forward sketch below

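A minimal sketch of what the retraining idea could look like as a walk-forward loop: refit every `retrain_every` bars on a sliding window and predict only until the next refit. All names here (`walk_forward_predict`, the XGBRegressor settings) are illustrative, not from the original notes.

```python
# Hypothetical walk-forward retraining loop (hourly would be retrain_every=60 on minute bars)
import pandas as pd
from xgboost import XGBRegressor

def walk_forward_predict(X: pd.DataFrame, y: pd.Series,
                         train_window: int = 5000, retrain_every: int = 60) -> pd.Series:
    preds = pd.Series(index=X.index, dtype=float)
    for start in range(train_window, len(X), retrain_every):
        # refit on the most recent train_window bars
        model = XGBRegressor(n_estimators=200, max_depth=5)
        model.fit(X.iloc[start - train_window:start], y.iloc[start - train_window:start])
        # predict only the bars until the next refit
        stop = min(start + retrain_every, len(X))
        preds.iloc[start:stop] = model.predict(X.iloc[start:stop])
    return preds
```
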
Features:

- add datetime features (useful for rush hour model) - see the sketch after this list
- add MT features as columns
- use convolutional networks to create features (https://www.youtube.com/watch?v=6wK4q8QvsV4)

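A sketch of the kind of datetime features meant above (hour, day of week, minutes since the 09:30 open); it assumes `features` has a DatetimeIndex in exchange time, and the column names are just suggestions.

```python
# Illustrative datetime features for a rush-hour model (assumes a DatetimeIndex)
features['hour'] = features.index.hour
features['minute'] = features.index.minute
features['day_of_week'] = features.index.dayofweek
# minutes elapsed since the 09:30 session open
features['minutes_since_open'] = (features.index.hour - 9) * 60 + features.index.minute - 30
```
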
Enhance model:

* multi target - see the XGBoost docs (sketch below)
* use SL with target price, valid for a few seconds
* how to handle an imbalanced dataset: https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html

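For the multi-target item, one possible reading is predicting several forward horizons at once. A sketch under that assumption only; the horizons, `X`, and model settings are illustrative, and newer XGBoost versions also support multi-output trees natively via `multi_strategy="multi_output_tree"`.

```python
# Sketch: multi-target regression over several forward-return horizons
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

horizons = [5, 21, 55]  # illustrative forward-bar horizons
Y = pd.DataFrame({f'fwd_ret_{h}': df['close'].shift(-h) / df['close'] - 1 for h in horizons})

model = MultiOutputRegressor(XGBRegressor(n_estimators=300, max_depth=5))
valid = X.dropna().index.intersection(Y.dropna().index)  # X = feature DataFrame from prepare_features
model.fit(X.loc[valid], Y.loc[valid])
```
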
Target:

- maybe add manual labeling

# Features

```python
import numpy as np
import pandas as pd


def prepare_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
    """Prepare enhanced features from the input df with focus on predictive potential."""
    features = pd.DataFrame(index=df.index)

    # Original ohlcv columns added to features
    features['close'] = df['close']
    features['volume'] = df['volume']
    features['trades_count'] = df['trades']
    features['buy_volume'] = df['buyvolume']
    features['sell_volume'] = df['sellvolume']
    features['high'] = df['high']
    features['low'] = df['low']
    # features['log_return'] = np.log(features['close'] / features['close'].shift(1))
    # features['returns_1'] = features['close'].pct_change()
    # features['returns_5'] = features['close'].pct_change(5)
    # features['returns_20'] = features['close'].pct_change(20)

    def get_fib_windows():
        """
        #TODO base the cap on real time (originally written for 1s bars)

        Generate Fibonacci window lengths up to ~1 hour. With the current cap of
        3600/60 = 60 (minute bars) this returns: 3, 5, 8, 13, 21, 34, 55.
        For 1s bars the cap would be 3600 and the sequence would continue to 2584.
        """
        fib_windows = [3, 5]
        while fib_windows[-1] < 3600 / 60:
            next_fib = fib_windows[-1] + fib_windows[-2]
            if next_fib > 3600 / 60:
                break
            fib_windows.append(next_fib)
        return fib_windows

    fib_windows = get_fib_windows()

    # Base price and returns
    features['log_return'] = np.log(features['close'] / features['close'].shift(1))
    features['price_velocity'] = (features['close'] - features['close'].shift(1)) / 1.0  # per second
    features['price_acceleration'] = features['price_velocity'] - features['price_velocity'].shift(1)

    # Fibonacci-based features
    for window in fib_windows:
        # Price features
        features[f'log_return_{window}s'] = np.log(features['close'] / features['close'].shift(window))
        features[f'volatility_{window}s'] = features['log_return'].rolling(window).std()
        features[f'range_{window}s'] = (features['high'].rolling(window).max() -
                                        features['low'].rolling(window).min()) / features['close']

        # Volume features
        features[f'volume_momentum_{window}s'] = (
            features['volume'].rolling(window).mean() /
            features['volume'].rolling(window * 2).mean()
        )

        features[f'buy_volume_momentum_{window}s'] = (
            features['buy_volume'].rolling(window).mean() /
            features['buy_volume'].rolling(window * 2).mean()
        )

        features[f'sell_volume_momentum_{window}s'] = (
            features['sell_volume'].rolling(window).mean() /
            features['sell_volume'].rolling(window * 2).mean()
        )

        # Trade features
        features[f'trade_intensity_{window}s'] = (
            features['trades_count'].rolling(window).mean() /
            features['trades_count'].rolling(window * 2).mean()
        )

        features[f'avg_trade_size_{window}s'] = (
            features['volume'].rolling(window).sum() /
            features['trades_count'].rolling(window).sum()
        )

        # Order flow features
        features[f'cum_volume_delta_{window}s'] = (
            features['buy_volume'] - features['sell_volume']
        ).rolling(window).sum()

        features[f'volume_pressure_{window}s'] = (
            features['buy_volume'].rolling(window).sum() /
            features['sell_volume'].rolling(window).sum()
        )

        # Price efficiency
        features[f'price_efficiency_{window}s'] = (
            np.abs(features['close'] - features['close'].shift(window)) /
            (features['high'].rolling(window).max() - features['low'].rolling(window).min())
        )

        # Moving averages and their crosses
        features[f'sma_{window}s'] = features['close'].rolling(window).mean()
        if window > 5:  # Create MA crosses with shorter timeframe
            features[f'ma_cross_5_{window}s'] = (
                features['close'].rolling(5).mean() -
                features['close'].rolling(window).mean()
            )

    # MA-based features
    ma_lengths = [5, 10, 20, 50]
    for length in ma_lengths:
        # Regular MAs
        features[f'ma_{length}'] = features['close'].rolling(length).mean()

        # MA slopes (rate of change)
        features[f'ma_{length}_slope'] = features[f'ma_{length}'].pct_change(3)

        # Price distance from MA
        features[f'price_ma_{length}_dist'] = (features['close'] - features[f'ma_{length}']) / features[f'ma_{length}']

        # MA crossovers
        if length > 5:
            features[f'ma_5_{length}_cross'] = (features['ma_5'] - features[f'ma_{length}']) / features[f'ma_{length}']

    # MA convergence/divergence
    features['ma_convergence'] = ((features['ma_5'] - features['ma_20']).abs() /
                                  features['ma_20'].rolling(10).mean())

    # Volatility features using MAs
    features['ma_volatility'] = features['ma_5'].rolling(10).std() / features['ma_20']

    # MA momentum
    features['ma_momentum'] = (features['ma_5'] / features['ma_5'].shift(5) - 1) * 100

    # Cleanup and feature selection
    features = features.replace([np.inf, -np.inf], np.nan)

    lookback = 1000
    if len(features) > lookback:
        rolling_corr = features.iloc[-lookback:].corr().abs()
        upper = rolling_corr.where(np.triu(np.ones(rolling_corr.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        print(f"Columns highly correlated - maybe drop? {to_drop}")
        # features = features.drop(columns=to_drop)

    feature_columns = list(features.columns)
    print(f"Features shape before dropna: {features.shape}")

    return features.dropna(), feature_columns
```

# Targets

## Unbalanced classes

```python
from xgboost import XGBClassifier

# Compute scale_pos_weight
n_0 = sum(y_train == 0)
n_1 = sum(y_train == 1)
scale_pos_weight = n_0 / n_1

model = XGBClassifier(scale_pos_weight=scale_pos_weight, ...)
```
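
Note that `scale_pos_weight` only applies to binary targets. For the 0/1/2 labels used later in these notes, one option (a sketch, not from the original) is per-sample class weights:

```python
# Sketch: class-balanced sample weights for the multi-class (0/1/2) case
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

sample_weight = compute_sample_weight(class_weight='balanced', y=y_train_classes)
model = XGBClassifier(objective='multi:softprob')
model.fit(X_train, y_train_classes, sample_weight=sample_weight)
```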

```python
def create_target_regressor(self, df: pd.DataFrame) -> pd.Series:
    """
    https://claude.ai/chat/8e7fe81c-ddbe-4e64-9af0-2bc4764fc5f0

    Creates an enhanced target variable using adaptive returns based on market conditions.
    Key improvements:
    1. Multi-timeframe momentum approach
    2. Volume-volatility regime adaptation
    3. Trend-following vs mean-reversion regime detection
    4. Noise reduction through sophisticated filtering

    Parameters:
    -----------
    df : pd.DataFrame
        Features df containing required columns: 'close', 'volume', volatility features

    Returns:
    --------
    pd.Series
        Enhanced target variable with cross-day targets removed
    """
    future_bars = self.config.forward_bars

    future_ma_fast = df['close'].shift(-future_bars).rolling(5).mean()  # currently unused

    # Calculate forward returns (original approach)
    forward_returns = df['close'].shift(-future_bars) / df['close'] - 1
    target = forward_returns

    # 6. Advanced noise reduction
    # Use exponential moving standard deviation for dynamic thresholds
    target_std = target.ewm(span=50, min_periods=20).std()

    # Adaptive thresholds based on rolling standard deviation
    upper_clip = 2.5 * target_std
    lower_clip = -2.5 * target_std

    # Apply soft clipping using hyperbolic tangent
    target = target_std * np.tanh(target / target_std)

    # Final hard clips for extreme outliers
    target = target.clip(lower=lower_clip, upper=upper_clip)

    # 7. Remove cross-day targets and intraday seasonality
    target = self.remove_crossday_targets(target, df, future_bars)

    # only 10% of extreme values from both sides are kept
    # target = target.where((target > target.quantile(0.9)) | (target < target.quantile(0.1)), 0)

    print("after target generation", target.index[[0, -1]])

    return target
```
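
`remove_crossday_targets` is referenced above but not included in these notes. A guess at its intent, purely an assumption: drop targets whose forward-return window spills into the next session.

```python
# Hypothetical helper: invalidate targets whose forward window crosses a day boundary
import pandas as pd

def remove_crossday_targets(self, target: pd.Series, df: pd.DataFrame, future_bars: int) -> pd.Series:
    days = pd.Series(df.index.normalize(), index=df.index)
    # the bar future_bars ahead must fall on the same calendar day, otherwise the target becomes NaN
    same_day = days == days.shift(-future_bars)
    return target.where(same_day)
```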

image-1.png (new binary file, 83 KiB)

ml-snippets.md (new file, 317 lines)


- [Features](#features)
  - [Features analysis](#features-analysis)
    - [Target to classes](#target-to-classes)
  - [Features importance](#features-importance)
  - [Features selection](#features-selection)
- [Prediction](#prediction)
  - [evaluation](#evaluation)
    - [calculated returns based on various probability prediction thresholds](#calculated-returns-based-on-various-probability-prediction-thresholds)
    - [cumulative returns based on prob predictions](#cumulative-returns-based-on-prob-predictions)
  - [charts](#charts)

# Features

## Features analysis

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate different percentiles
percentiles = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]
print("\nPercentiles:")
for p in percentiles:
    print(f"{p}th percentile: {df['target'].quantile(p/100):.6f}")

# Plot distribution
plt.figure(figsize=(15, 10))

# Plot 1: Overall distribution
plt.subplot(2, 2, 1)
sns.histplot(df['target'], bins=100)
plt.title('Distribution of Returns')
plt.axvline(x=0, color='r', linestyle='--', alpha=0.5)

# Plot 2: Distribution with potential thresholds
plt.subplot(2, 2, 2)
sns.histplot(df['target'], bins=100)
plt.title('Distribution with Potential Thresholds')

# Add lines for different standard deviations
std = df['target'].std()
mean = df['target'].mean()
for i in [0.5, 1.0, 1.5]:
    plt.axvline(x=mean + i*std, color='g', linestyle='--', alpha=0.3, label=f'+{i} std')
    plt.axvline(x=mean - i*std, color='r', linestyle='--', alpha=0.3, label=f'-{i} std')
plt.legend()

# Let's try different threshold approaches
# Approach 1: Standard deviation based
std_multiplier = 0.2
std_threshold = std_multiplier * std
labels_std = np.where(df['target'] > std_threshold, 1,
                      np.where(df['target'] < -std_threshold, -1, 0))

# Approach 2: Percentile based
percentile_threshold = 0.2  # top/bottom 20%
top_threshold = df['target'].quantile(1 - percentile_threshold)
bottom_threshold = df['target'].quantile(percentile_threshold)
labels_percentile = np.where(df['target'] > top_threshold, 1,
                             np.where(df['target'] < bottom_threshold, -1, 0))

# Plot 3: Distribution of STD-based classes
plt.subplot(2, 2, 3)
sns.histplot(data=pd.DataFrame({'return': df['target'], 'class': labels_std}),
             x='return', hue='class', bins=100)
plt.title(f'Classes Based on {std_multiplier} Standard Deviation')
plt.axvline(x=std_threshold, color='g', linestyle='--', alpha=0.5)
plt.axvline(x=-std_threshold, color='r', linestyle='--', alpha=0.5)

# Plot 4: Distribution of Percentile-based classes
plt.subplot(2, 2, 4)
sns.histplot(data=pd.DataFrame({'return': df['target'], 'class': labels_percentile}),
             x='return', hue='class', bins=100)
plt.title(f'Classes Based on {percentile_threshold*100}th Percentiles')
plt.axvline(x=top_threshold, color='g', linestyle='--', alpha=0.5)
plt.axvline(x=bottom_threshold, color='r', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

# Print class distributions
print("\nClass Distribution (STD-based):")
print(pd.Series(labels_std).value_counts(normalize=True))

print("\nClass Distribution (Percentile-based):")
print(pd.Series(labels_percentile).value_counts(normalize=True))

# Calculate mean return for each class
print("\nMean Return by Class (STD-based):")
std_df = pd.DataFrame({'return': df['target'], 'class': labels_std})
print(std_df.groupby('class')['return'].mean())

print("\nMean Return by Class (Percentile-based):")
perc_df = pd.DataFrame({'return': df['target'], 'class': labels_percentile})
print(perc_df.groupby('class')['return'].mean())
```

<img src="image-1.png" alt="Target distributions" width="300"/>

### Target to classes

Based on std dev

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read and prepare the data
df = pd.read_csv('model_data.csv')
df = df.drop('ts_event', axis=1)

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Split the data first so we only use train data statistics for thresholds
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate threshold based on training data only
train_std = y_train.std()
threshold = 0.2 * train_std

# Transform targets into classes: 0 (short), 1 (neutral), 2 (long) instead of -1, 0, 1
def create_labels(y, threshold):
    return np.where(y > threshold, 2,
                    np.where(y < -threshold, 0, 1))

y_train_classes = create_labels(y_train, threshold)
y_test_classes = create_labels(y_test, threshold)

# Print class distribution
print("Training Class Distribution:")
print(pd.Series(y_train_classes).value_counts(normalize=True))
print("\nTest Class Distribution:")
print(pd.Series(y_test_classes).value_counts(normalize=True))
```

Based on percentile/threshold - see the sketch below.

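No snippet was saved for the percentile variant; a minimal sketch, assuming the same train/test split and the same 0/1/2 class convention as above:

```python
# Sketch: percentile-based class labels using train-set quantiles only
import numpy as np

pct = 0.2  # top/bottom 20%
top_threshold = y_train.quantile(1 - pct)
bottom_threshold = y_train.quantile(pct)

def create_labels_pct(y, top, bottom):
    # 2 = long, 0 = short, 1 = neutral (same convention as the std-based version)
    return np.where(y > top, 2, np.where(y < bottom, 0, 1))

y_train_classes = create_labels_pct(y_train, top_threshold, bottom_threshold)
y_test_classes = create_labels_pct(y_test, top_threshold, bottom_threshold)
```
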
## Features importance

```python
# XGB top 20 feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Top 20 Most Important Features')
plt.xlabel('Feature Importance')
plt.tight_layout()
plt.show()
```

## Features selection

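This section was left empty; one simple option (a sketch, assuming the fitted `xgb_model` from the importance snippet above) is to keep only features with above-median importance. The names intentionally match the `selected_features` / `X_test_selected` variables that the return-calculation snippet further down assumes exist.

```python
# Sketch: keep features with above-median importance from the fitted xgb_model
import pandas as pd

importances = pd.Series(xgb_model.feature_importances_, index=X.columns)
selected_features = importances[importances > importances.median()].index.tolist()

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]
print(f"Selected {len(selected_features)} of {X.shape[1]} features")
```
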
# Prediction

## evaluation

```python
# Calculate directional accuracy
directional_accuracy = (np.sign(y_pred) == np.sign(y_test)).mean()
print(f"Directional Accuracy: {directional_accuracy:.4f}")

# Plot confusion matrix
from sklearn.metrics import confusion_matrix

plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test_classes, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
```
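
A per-class precision/recall summary is a natural companion to the confusion matrix (not in the original notes, but standard scikit-learn):

```python
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the 0/1/2 labels
print(classification_report(y_test_classes, y_pred, target_names=['short', 'neutral', 'long']))
```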

### calculated returns based on various probability prediction thresholds

```python
# .predict_proba() gives the probabilities for each class
print("Predicted probabilities:", model.predict_proba(X_test))
# Output example (binary case):
# [
#   [0.35, 0.65],  # 35% class 0, 65% class 1
#   [0.70, 0.30],  # 70% class 0, 30% class 1
#   [0.45, 0.55],  # 45% class 0, 55% class 1
# ]
```

Chart probabilities

```python
import plotly.graph_objects as go

# Predict probabilities for each class
probabilities = model.predict_proba(X_test)  # Shape: (n_samples, n_classes)
results_df = pd.DataFrame({
    'Date': dates_test,
    'Short Probability': probabilities[:, 0],    # Probability of class 0 (short)
    'Neutral Probability': probabilities[:, 1],  # Probability of class 1 (neutral)
    'Long Probability': probabilities[:, 2]      # Probability of class 2 (long)
}).sort_values(by='Date')  # Sort by date for time series plotting

fig = go.Figure()

# Add lines for each class probability
fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Short Probability'],
    mode='lines', name='Short (Class 0)', line=dict(color='red')
))

fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Neutral Probability'],
    mode='lines', name='Neutral (Class 1)', line=dict(color='orange')
))

fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Long Probability'],
    mode='lines', name='Long (Class 2)', line=dict(color='green')
))

# Add title and labels
fig.update_layout(
    title="Time Series of Predicted Class Probabilities",
    xaxis_title="Date",
    yaxis_title="Probability",
    legend_title="Class"
)

fig.show()
```

### cumulative returns based on prob predictions

```python
from sklearn.metrics import accuracy_score

# Calculate returns based on probability predictions
def calculate_returns(predictions, actual_returns, confidence_threshold=0.0):
    pred_probs = final_model.predict_proba(X_test_selected)
    max_probs = np.max(pred_probs, axis=1)

    # Only take positions when confidence exceeds threshold
    positions = np.zeros_like(predictions, dtype=float)
    confident_mask = max_probs > confidence_threshold

    # Convert predictions 0 -> -1, 2 -> 1 for returns calculation
    adj_predictions = np.where(predictions == 2, 1, np.where(predictions == 0, -1, 0))
    positions[confident_mask] = adj_predictions[confident_mask]

    returns = positions * actual_returns
    return returns, np.mean(confident_mask)


# Test different confidence thresholds
confidence_thresholds = [0.4, 0.5, 0.6, 0.7, 0.8]
results = []

for conf_threshold in confidence_thresholds:
    returns, coverage = calculate_returns(y_pred, y_test.values, conf_threshold)

    # Calculate metrics
    sharpe = np.sqrt(252) * returns.mean() / returns.std()
    accuracy = accuracy_score(y_test_classes[returns != 0],
                              y_pred[returns != 0])

    results.append({
        'confidence_threshold': conf_threshold,
        'sharpe': sharpe,
        'accuracy': accuracy,
        'coverage': coverage
    })

# Plot cumulative returns for the different confidence thresholds
plt.figure(figsize=(12, 6))
for th in confidence_thresholds:
    returns, _ = calculate_returns(y_pred, y_test.values, th)
    cumulative_returns = (1 + returns).cumprod()
    plt.plot(cumulative_returns, label=f'threshold {th}')
plt.title('Cumulative Returns by Confidence Threshold')
plt.xlabel('Trade Number')
plt.ylabel('Cumulative Return')
plt.legend()
plt.grid(True)
plt.show()


results_df = pd.DataFrame(results)
print("\nPerformance at different confidence thresholds:")
print(results_df)

# Plot feature importance
importance_df = pd.DataFrame({
    'feature': selected_features,
    'importance': final_model.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()
```

## charts

```python
# Actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Returns')
plt.ylabel('Predicted Returns')
plt.title('Actual vs Predicted Returns')
plt.tight_layout()
plt.show()
```

@@ -132,7 +132,35 @@ basic_data = vbt.Data.from_data(vbt.symbol_dict({"BAC": ohlcv_df}), tz_convert=z
basic_data.wrapper.index.normalize().nunique() #numdays

#Fetching Trades and Aggregating custom OHLCV
from ttools import load_data

#This is how to call the load function
symbol = ["SPY", "BAC"]

#datetime in zoneNY
day_start = datetime(2024, 1, 15, 9, 30, 0)
day_stop = datetime(2024, 10, 20, 16, 0, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)

#requested aggregation
resolution = 1 #12s bars
agg_type = AggType.OHLCV #other types: AggType.OHLCV_VOL, AggType.OHLCV_DOL, AggType.OHLCV_RENKO
exclude_conditions = ['C','O','4','B','7','V','P','W','U','Z','F','9','M','6'] #None to use defaults
minsize = 100 #min trade size to include
main_session_only = False
force_remote = False

data = load_data(symbol = symbol,
                 agg_type = agg_type,
                 resolution = resolution,
                 start_date = day_start,
                 end_date = day_stop,
                 #exclude_conditions = None,
                 minsize = minsize,
                 main_session_only = main_session_only,
                 force_remote = force_remote,
                 return_vbt = True, #returns vbt object
                 verbose = True
                 )
```

## REINDEX to main session

@@ -266,7 +294,8 @@ _feature_config: tp.ClassVar[Config] = HybridConfig(
basic_data._feature_config = _feature_config
```

```python
#1s to 1T
t1data = basic_data[['open', 'high', 'low', 'close', 'volume','vwap','buyvolume','trades','sellvolume']].resample("1T")
t1data = t1data.transform(lambda df: df.between_time('09:30', '16:00').dropna())

@@ -275,6 +304,7 @@ t1data = t1data.transform(lambda df: df.between_time('09:30', '16:00').dropna())
resampler_s = vbt.Resampler(target_data.index, source_data.index, source_freq="1T", target_freq="1s")
basic_data.resample(resampler_s)
```

# REALIGN

@@ -1383,6 +1413,8 @@ pf_stats.sort_values(by='Sharpe Ratio', ascending=False).iloc[::-1].vbt.heatmap(
# UTILS

```python
#use plotly resampler
vbt.settings.plotting["use_resampler"] = True

#RELOAD module in ipynb
%load_ext autoreload
```