diff --git a/features_targets.md b/features_targets.md
new file mode 100644
index 0000000..48a73e6
--- /dev/null
+++ b/features_targets.md
@@ -0,0 +1,243 @@

# Things to try

TODO:
* better labeling
* continue here https://claude.ai/chat/b3ee78b6-9662-4f25-95f0-ecac4a78a41b
* try model with other symbols
* try different retraining options (even hourly)

Features:
- add datetime features (useful for a rush-hour model)
- add MT features as columns
- use convolutional networks to create features (https://www.youtube.com/watch?v=6wK4q8QvsV4)

Enhance model:
* multi-target - see the XGBoost docs
* use SL with target price, valid for a few seconds
* how to handle an imbalanced dataset: https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html

Target:
- maybe add manual labeling

# Features

```python

    def prepare_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
        """Prepare enhanced features from input df with focus on predictive potential"""
        features = pd.DataFrame(index=df.index)

        # Original ohlcv added to features
        features['close'] = df['close']
        features['volume'] = df['volume']
        features['trades_count'] = df['trades']
        features['buy_volume'] = df['buyvolume']
        features['sell_volume'] = df['sellvolume']
        features['high'] = df['high']
        features['low'] = df['low']
        # features['log_return'] = np.log(features['close'] / features['close'].shift(1))
        # features['returns_1'] = features['close'].pct_change()
        # features['returns_5'] = features['close'].pct_change(5)
        # features['returns_20'] = features['close'].pct_change(20)

        def get_fib_windows():
            """
            #TODO base windows on real time (originally written for 1s bars)

            Generate Fibonacci window lengths up to one hour of 1-minute bars (3600/60 = 60).
            Returns the sequence: 3, 5, 8, 13, 21, 34, 55
            """
            fib_windows = [3, 5]
            while fib_windows[-1] < 3600/60:
                next_fib = fib_windows[-1] + fib_windows[-2]
                if next_fib > 3600/60:
                    break
                fib_windows.append(next_fib)
            return fib_windows

        fib_windows = get_fib_windows()

        # Base price and returns
        features['log_return'] = np.log(features['close'] / features['close'].shift(1))
        features['price_velocity'] = (features['close'] - features['close'].shift(1)) / 1.0  # per bar
        features['price_acceleration'] = features['price_velocity'] - features['price_velocity'].shift(1)

        # Fibonacci-based features
        for window in fib_windows:
            # Price features
            features[f'log_return_{window}s'] = np.log(features['close'] / features['close'].shift(window))
            features[f'volatility_{window}s'] = features['log_return'].rolling(window).std()
            features[f'range_{window}s'] = (features['high'].rolling(window).max() -
                                            features['low'].rolling(window).min()) / features['close']

            # Volume features
            features[f'volume_momentum_{window}s'] = (
                features['volume'].rolling(window).mean() /
                features['volume'].rolling(window * 2).mean()
            )

            features[f'buy_volume_momentum_{window}s'] = (
                features['buy_volume'].rolling(window).mean() /
                features['buy_volume'].rolling(window * 2).mean()
            )

            features[f'sell_volume_momentum_{window}s'] = (
                features['sell_volume'].rolling(window).mean() /
                features['sell_volume'].rolling(window * 2).mean()
            )

            # Trade features
            features[f'trade_intensity_{window}s'] = (
                features['trades_count'].rolling(window).mean() /
                features['trades_count'].rolling(window * 2).mean()
            )

            features[f'avg_trade_size_{window}s'] = (
                features['volume'].rolling(window).sum() /
                features['trades_count'].rolling(window).sum()
            )
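            # The order-flow features below summarize buyer/seller imbalance over the
            # window: cum_volume_delta is the net (buy - sell) volume, and volume_pressure
            # is the buy/sell volume ratio (inf when sell volume is zero; infs are
            # replaced with NaN in the cleanup step at the end of this method).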

            # Order flow features
            features[f'cum_volume_delta_{window}s'] = (
                features['buy_volume'] - features['sell_volume']
            ).rolling(window).sum()

            features[f'volume_pressure_{window}s'] = (
                features['buy_volume'].rolling(window).sum() /
                features['sell_volume'].rolling(window).sum()
            )

            # Price efficiency
            features[f'price_efficiency_{window}s'] = (
                np.abs(features['close'] - features['close'].shift(window)) /
                (features['high'].rolling(window).max() - features['low'].rolling(window).min())
            )

            # Moving averages and their crosses
            features[f'sma_{window}s'] = features['close'].rolling(window).mean()
            if window > 5:  # Create MA crosses with shorter timeframe
                features[f'ma_cross_5_{window}s'] = (
                    features['close'].rolling(5).mean() -
                    features['close'].rolling(window).mean()
                )

        # MA-based features
        ma_lengths = [5, 10, 20, 50]
        for length in ma_lengths:
            # Regular MAs
            features[f'ma_{length}'] = features['close'].rolling(length).mean()

            # MA slopes (rate of change)
            features[f'ma_{length}_slope'] = features[f'ma_{length}'].pct_change(3)

            # Price distance from MA
            features[f'price_ma_{length}_dist'] = (features['close'] - features[f'ma_{length}']) / features[f'ma_{length}']

            # MA crossovers
            if length > 5:
                features[f'ma_5_{length}_cross'] = (features['ma_5'] - features[f'ma_{length}']) / features[f'ma_{length}']

        # MA convergence/divergence
        features['ma_convergence'] = ((features['ma_5'] - features['ma_20']).abs() /
                                      features['ma_20'].rolling(10).mean())

        # Volatility features using MAs
        features['ma_volatility'] = features['ma_5'].rolling(10).std() / features['ma_20']

        # MA momentum
        features['ma_momentum'] = (features['ma_5'] / features['ma_5'].shift(5) - 1) * 100

        # Cleanup and feature selection
        features = features.replace([np.inf, -np.inf], np.nan)

        lookback = 1000
        if len(features) > lookback:
            rolling_corr = features.iloc[-lookback:].corr().abs()
            upper = rolling_corr.where(np.triu(np.ones(rolling_corr.shape), k=1).astype(bool))
            to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
            print(f"Highly correlated columns (>0.95) - consider dropping: {to_drop}")
            #features = features.drop(columns=to_drop)

        feature_columns = list(features.columns)
        print(f"Features shape before dropna: {features.shape}")

        return features.dropna(), feature_columns
```

# Targets

## Imbalanced classes

For a binary target, XGBoost's `scale_pos_weight` (the ratio of negative to positive samples) rebalances the classes:

```python
from xgboost import XGBClassifier

# Compute scale_pos_weight (binary classification only)
n_0 = sum(y_train == 0)
n_1 = sum(y_train == 1)
scale_pos_weight = n_0 / n_1

model = XGBClassifier(scale_pos_weight=scale_pos_weight)  # plus any other params
```
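
The `scale_pos_weight` trick only covers the binary case. For the three-class labels (0 = short, 1 = neutral, 2 = long) built later in ml-snippets.md, per-sample weights are one option. A minimal sketch, assuming `X_train` and `y_train_classes` from the class-creation snippet in ml-snippets.md:

```python
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# 'balanced' weights each sample inversely to its class frequency,
# so the rarer long/short classes are not drowned out by the neutral class
sample_weight = compute_sample_weight("balanced", y_train_classes)

model = XGBClassifier()
model.fit(X_train, y_train_classes, sample_weight=sample_weight)
```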

```python
    def create_target_regressor(self, df: pd.DataFrame) -> pd.Series:
        """
        https://claude.ai/chat/8e7fe81c-ddbe-4e64-9af0-2bc4764fc5f0

        Creates an enhanced target variable using adaptive returns based on market conditions.
        Key improvements:
        1. Multi-timeframe momentum approach
        2. Volume-volatility regime adaptation
        3. Trend-following vs mean-reversion regime detection
        4. Noise reduction through sophisticated filtering

        Parameters:
        -----------
        df : pd.DataFrame
            Features df containing required columns: 'close', 'volume', volatility features

        Returns:
        --------
        pd.Series
            Enhanced target variable with cross-day targets removed
        """

        future_bars = self.config.forward_bars

        future_ma_fast = df['close'].shift(-future_bars).rolling(5).mean()  # note: currently unused

        # Calculate forward returns (original approach)
        forward_returns = df['close'].shift(-future_bars) / df['close'] - 1

        target = forward_returns

        # Noise reduction
        # Use exponential moving standard deviation for dynamic thresholds
        target_std = target.ewm(span=50, min_periods=20).std()

        # Adaptive thresholds based on rolling standard deviation
        upper_clip = 2.5 * target_std
        lower_clip = -2.5 * target_std

        # Apply soft clipping using hyperbolic tangent
        # (|tanh| < 1, so this already bounds |target| by target_std)
        target = target_std * np.tanh(target / target_std)

        # Final hard clips for extreme outliers
        target = target.clip(lower=lower_clip, upper=upper_clip)

        # Remove cross-day targets and intraday seasonality
        target = self.remove_crossday_targets(target, df, future_bars)

        # only 10% of extreme values from both sides are kept
        #target = target.where((target > target.quantile(0.9)) | (target < target.quantile(0.1)), 0)

        print("after target generation", target.index[[0, -1]])

        return target
```
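
`remove_crossday_targets` is referenced above but not defined in this file. Purely as an illustration of the idea (a hypothetical sketch, not the actual helper), it could invalidate any target whose `future_bars` forward window reaches into the next trading day:

```python
    def remove_crossday_targets(self, target: pd.Series, df: pd.DataFrame, future_bars: int) -> pd.Series:
        """Hypothetical sketch: drop targets whose forward window crosses a day boundary."""
        # Timestamp of the bar the forward return looks at (future_bars ahead)
        future_ts = pd.Series(df.index, index=df.index).shift(-future_bars)
        same_day = future_ts.dt.normalize() == df.index.normalize()
        # Targets spanning two sessions (or running past the end of the data) become NaN
        return target.where(same_day)
```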

diff --git a/image-1.png b/image-1.png
new file mode 100644
index 0000000..a730f81
Binary files /dev/null and b/image-1.png differ
diff --git a/ml-snippets.md b/ml-snippets.md
new file mode 100644
index 0000000..340dcd2
--- /dev/null
+++ b/ml-snippets.md
@@ -0,0 +1,317 @@
- [Features](#features)
  - [Features analysis](#features-analysis)
  - [Target to classes](#target-to-classes)
  - [Features importance](#features-importance)
  - [Features selection](#features-selection)
- [Prediction](#prediction)
  - [evaluation](#evaluation)
  - [calculated returns based on various probability prediction thresholds](#calculated-returns-based-on-various-probability-prediction-thresholds)
  - [cumulative returns based on prob predictions](#cumulative-returns-based-on-prob-predictions)
  - [charts](#charts)


# Features

## Features analysis

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate different percentiles
percentiles = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]
print("\nPercentiles:")
for p in percentiles:
    print(f"{p}th percentile: {df['target'].quantile(p/100):.6f}")

# Plot distribution
plt.figure(figsize=(15, 10))

# Plot 1: Overall distribution
plt.subplot(2, 2, 1)
sns.histplot(df['target'], bins=100)
plt.title('Distribution of Returns')
plt.axvline(x=0, color='r', linestyle='--', alpha=0.5)

# Plot 2: Distribution with potential thresholds
plt.subplot(2, 2, 2)
sns.histplot(df['target'], bins=100)
plt.title('Distribution with Potential Thresholds')

# Add lines for different standard deviations
std = df['target'].std()
mean = df['target'].mean()
for i in [0.5, 1.0, 1.5]:
    plt.axvline(x=mean + i*std, color='g', linestyle='--', alpha=0.3, label=f'+{i} std')
    plt.axvline(x=mean - i*std, color='r', linestyle='--', alpha=0.3, label=f'-{i} std')
plt.legend()

# Try different threshold approaches
# Approach 1: Standard deviation based
std_multiplier = 0.2
std_threshold = std_multiplier * std
labels_std = np.where(df['target'] > std_threshold, 1,
                      np.where(df['target'] < -std_threshold, -1, 0))

# Approach 2: Percentile based
percentile_threshold = 0.2  # top/bottom 20%
top_threshold = df['target'].quantile(1 - percentile_threshold)
bottom_threshold = df['target'].quantile(percentile_threshold)
labels_percentile = np.where(df['target'] > top_threshold, 1,
                             np.where(df['target'] < bottom_threshold, -1, 0))

# Plot 3: Distribution of STD-based classes
plt.subplot(2, 2, 3)
sns.histplot(data=pd.DataFrame({'return': df['target'], 'class': labels_std}),
             x='return', hue='class', bins=100)
plt.title(f'Classes Based on {std_multiplier} Standard Deviation')
plt.axvline(x=std_threshold, color='g', linestyle='--', alpha=0.5)
plt.axvline(x=-std_threshold, color='r', linestyle='--', alpha=0.5)

# Plot 4: Distribution of Percentile-based classes
plt.subplot(2, 2, 4)
sns.histplot(data=pd.DataFrame({'return': df['target'], 'class': labels_percentile}),
             x='return', hue='class', bins=100)
plt.title(f'Classes Based on {percentile_threshold*100}th Percentiles')
plt.axvline(x=top_threshold, color='g', linestyle='--', alpha=0.5)
plt.axvline(x=bottom_threshold, color='r', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

# Print class distributions
print("\nClass Distribution (STD-based):")
print(pd.Series(labels_std).value_counts(normalize=True))

print("\nClass Distribution (Percentile-based):")
print(pd.Series(labels_percentile).value_counts(normalize=True))

# Calculate mean return for each class
print("\nMean Return by Class (STD-based):")
std_df = pd.DataFrame({'return': df['target'], 'class': labels_std})
print(std_df.groupby('class')['return'].mean())

print("\nMean Return by Class (Percentile-based):")
perc_df = pd.DataFrame({'return': df['target'], 'class': labels_percentile})
print(perc_df.groupby('class')['return'].mean())
```

Target distributions

### Target to classes

Based on std dev

```python
from sklearn.model_selection import train_test_split

# Read and prepare the data
df = pd.read_csv('model_data.csv')
df = df.drop('ts_event', axis=1)

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Split the data first so we only use train data statistics for thresholds
# (note: for time-ordered data a chronological split, shuffle=False, may be preferable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate threshold based on training data only
train_std = y_train.std()
threshold = 0.2 * train_std

# Transform targets into classes: 0, 1, 2 instead of -1, 0, 1
# (XGBoost expects non-negative class labels)
def create_labels(y, threshold):
    return np.where(y > threshold, 2,
                    np.where(y < -threshold, 0, 1))

y_train_classes = create_labels(y_train, threshold)
y_test_classes = create_labels(y_test, threshold)

# Print class distribution
print("Training Class Distribution:")
print(pd.Series(y_train_classes).value_counts(normalize=True))
print("\nTest Class Distribution:")
print(pd.Series(y_test_classes).value_counts(normalize=True))
```

Based on percentile/threshold: TBD

## Features importance

```python
# XGB top 20 feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Top 20 Most Important Features')
plt.xlabel('Feature Importance')
plt.tight_layout()
plt.show()
```

## Features selection
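
The prediction snippets below refer to `selected_features`, `X_test_selected` and `final_model`, which are not defined in this file. One way they could be produced is to keep only the features whose importance in the fitted `xgb_model` is above the median; this is a minimal sketch, and all names and the threshold are assumptions rather than the original workflow:

```python
from xgboost import XGBClassifier

# Rank features by importance of the already-fitted model
importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
selected_features = importances[importances > importances.median()].index.tolist()

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Retrain on the reduced feature set
final_model = XGBClassifier()
final_model.fit(X_train_selected, y_train_classes)
```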

# Prediction

## evaluation

```python
# Calculate directional accuracy
directional_accuracy = (np.sign(y_pred) == np.sign(y_test)).mean()
print(f"Directional Accuracy: {directional_accuracy:.4f}")

# Confusion matrix
from sklearn.metrics import confusion_matrix

plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test_classes, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
```

### calculated returns based on various probability prediction thresholds

```python
# .predict_proba() gives the probabilities for each class
print("Predicted probabilities:", model.predict_proba(X_test))
# Output example (binary case):
# [
#   [0.35, 0.65],  # 35% class 0, 65% class 1
#   [0.70, 0.30],  # 70% class 0, 30% class 1
#   [0.45, 0.55],  # 45% class 0, 55% class 1
# ]
```

Chart probabilities

```python
import plotly.graph_objects as go

# Predict probabilities for each class
probabilities = model.predict_proba(X_test)  # Shape: (n_samples, n_classes)
results_df = pd.DataFrame({
    'Date': dates_test,
    'Short Probability': probabilities[:, 0],    # Probability of class 0 (short)
    'Neutral Probability': probabilities[:, 1],  # Probability of class 1 (neutral)
    'Long Probability': probabilities[:, 2]      # Probability of class 2 (long)
}).sort_values(by='Date')  # Sort by date for time series plotting

fig = go.Figure()

# Add lines for each class probability
fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Short Probability'],
    mode='lines', name='Short (Class 0)', line=dict(color='red')
))

fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Neutral Probability'],
    mode='lines', name='Neutral (Class 1)', line=dict(color='orange')
))

fig.add_trace(go.Scatter(
    x=results_df['Date'], y=results_df['Long Probability'],
    mode='lines', name='Long (Class 2)', line=dict(color='green')
))

# Add title and labels
fig.update_layout(
    title="Time Series of Predicted Class Probabilities",
    xaxis_title="Date",
    yaxis_title="Probability",
    legend_title="Class"
)

fig.show()
```

### cumulative returns based on prob predictions

```python
from sklearn.metrics import accuracy_score

# Calculate returns based on probability predictions
def calculate_returns(predictions, actual_returns, confidence_threshold=0.0):
    pred_probs = final_model.predict_proba(X_test_selected)
    max_probs = np.max(pred_probs, axis=1)

    # Only take positions when confidence exceeds threshold
    positions = np.zeros_like(predictions, dtype=float)
    confident_mask = max_probs > confidence_threshold

    # Convert predictions 0 -> -1 (short), 2 -> 1 (long) for returns calculation
    adj_predictions = np.where(predictions == 2, 1, np.where(predictions == 0, -1, 0))
    positions[confident_mask] = adj_predictions[confident_mask]

    returns = positions * actual_returns
    return returns, np.mean(confident_mask)

# Test different confidence thresholds
confidence_thresholds = [0.4, 0.5, 0.6, 0.7, 0.8]
results = []

for conf_threshold in confidence_thresholds:
    returns, coverage = calculate_returns(y_pred, y_test.values, conf_threshold)

    # Calculate metrics
    sharpe = np.sqrt(252) * returns.mean() / returns.std()
    accuracy = accuracy_score(y_test_classes[returns != 0],
                              y_pred[returns != 0])

    results.append({
        'confidence_threshold': conf_threshold,
        'sharpe': sharpe,
        'accuracy': accuracy,
        'coverage': coverage
    })

# Plot cumulative returns at the different confidence thresholds
plt.figure(figsize=(12, 6))
for th in confidence_thresholds:
    returns, _ = calculate_returns(y_pred, y_test.values, th)
    cumulative_returns = (1 + returns).cumprod()
    plt.plot(cumulative_returns, label=f'threshold {th}')
plt.title('Cumulative Returns at Different Confidence Thresholds')
plt.xlabel('Trade Number')
plt.ylabel('Cumulative Return')
plt.legend()
plt.grid(True)
plt.show()

results_df = pd.DataFrame(results)
print("\nPerformance at different confidence thresholds:")
print(results_df)

# Plot feature importance
importance_df = pd.DataFrame({
    'feature': selected_features,
    'importance': final_model.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()
```

## charts

```python
# Actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Returns')
plt.ylabel('Predicted Returns')
plt.title('Actual vs Predicted Returns')
plt.tight_layout()
plt.show()
```
diff --git a/vbt-snippets.md b/vbt-snippets.md
index 916f446..49812a5 100644
--- a/vbt-snippets.md
+++ b/vbt-snippets.md
@@ -132,7 +132,35 @@ basic_data = vbt.Data.from_data(vbt.symbol_dict({"BAC": ohlcv_df}), tz_convert=z
 basic_data.wrapper.index.normalize().nunique() #numdays
 
 #Fetching Trades and Aggregating custom OHLCV
-TBD
+from ttools import load_data
+#This is how to call the LOAD function
+symbol = ["SPY", "BAC"]
+#datetime in zoneNY
+day_start = datetime(2024, 1, 15, 9, 30, 0)
+day_stop = datetime(2024, 10, 20, 16, 0, 0)
+day_start = zoneNY.localize(day_start)
+day_stop = zoneNY.localize(day_stop)
+
+#requested AGG
+resolution = 1 #12s bars
+agg_type = AggType.OHLCV #other types AggType.OHLCV_VOL, AggType.OHLCV_DOL, AggType.OHLCV_RENKO
+exclude_conditions = ['C','O','4','B','7','V','P','W','U','Z','F','9','M','6'] #None to defaults
+minsize = 100 #min trade size to include
+main_session_only = False
+force_remote = False
+
+data = load_data(symbol = symbol,
+                agg_type = agg_type,
+                resolution = resolution,
+                start_date = day_start,
+                end_date = day_stop,
+                #exclude_conditions = None,
+                minsize = minsize,
+                main_session_only = main_session_only,
+                force_remote = force_remote,
+                return_vbt = True, #returns vbt object
+                verbose = True
+                )
 ```
 
 ## REINDEX to main session
@@ -266,7 +294,8 @@ _feature_config: tp.ClassVar[Config] = HybridConfig(
 basic_data._feature_config = _feature_config
 ```
 
-ddd
+
+```python
 #1s to 1T
 t1data = basic_data[['open', 'high', 'low', 'close', 'volume','vwap','buyvolume','trades','sellvolume']].resample("1T")
 t1data = t1data.transform(lambda df: df.between_time('09:30', '16:00').dropna())
@@ -275,6 +304,7 @@ t1data = t1data.transform(lambda df: df.between_time('09:30', '16:00').dropna())
 
 resampler_s = vbt.Resampler(target_data.index, source_data.index, source_freq="1T", target_freq="1s")
 basic_data.resample(resampler_s)
+```
 
 # REALIGN
@@ -1383,6 +1413,8 @@ pf_stats.sort_values(by='Sharpe Ratio', ascending=False).iloc[::-1].vbt.heatmap(
 
 # UTILS
 ```python
+#use plotly resampler
+vbt.settings.plotting["use_resampler"] = True
 #RELOAD module in ipynb
 %load_ext autoreload