
Notes on the features and targets used by the model.

> [!NOTE] Contents

> [!note]- note something something

> [!example]- Graph: volatility average slope
> ![[Pasted image 20250630140635.png]]

> [!example]- Graph: volatility average slope across 1d to 30d range of windows
> ![[Volatility_average_slope.png]]
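The plotting code behind these graphs isn't in this snippet. Below is a hedged sketch of how a volatility-average-slope curve across 1d–30d windows could be computed, assuming 1-minute bars (1440 per day), volatility as the rolling std of log returns, and "slope" as the mean first difference of that series; the synthetic price series is purely illustrative.

```python
import numpy as np
import pandas as pd

# Synthetic 60 days of 1-minute closes (illustrative only)
rng = np.random.default_rng(0)
close = pd.Series(100 * np.exp(np.cumsum(rng.normal(0, 1e-4, 60 * 1440))))
log_ret = np.log(close / close.shift(1))

# Mean slope of rolling volatility for each window length from 1 to 30 days
avg_slope = {
    days: log_ret.rolling(days * 1440).std().diff().mean()
    for days in range(1, 31)
}
print(pd.Series(avg_slope))
```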

## Things to try

TODO:

- Features:
- Target:
    - maybe add manual labeling

## Features


```python
    def prepare_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
        """Prepare enhanced features from input df with focus on predictive potential"""
        features = pd.DataFrame(index=df.index)

        # Original ohlcv added to features
        features['close'] = df['close']
        features['volume'] = df['volume']
        features['trades_count'] = df['trades']
        features['buy_volume'] = df['buyvolume']
        features['sell_volume'] = df['sellvolume']
        features['high'] = df['high']
        features['low'] = df['low']
        # features['log_return'] = np.log(features['close'] / features['close'].shift(1))
        # features['returns_1'] = features['close'].pct_change()
        # features['returns_5'] = features['close'].pct_change(5)
        # features['returns_20'] = features['close'].pct_change(20)

        def get_fib_windows():
            """
            #TODO: base these on real time (they were originally tuned for 1s bars)

            Generate Fibonacci window lengths up to ~1 hour of 1-minute bars
            (3600 s / 60 s = 60 bars).
            Returns: 3, 5, 8, 13, 21, 34, 55
            """
            fib_windows = [3, 5]
            while fib_windows[-1] < 3600 / 60:
                next_fib = fib_windows[-1] + fib_windows[-2]
                if next_fib > 3600 / 60:
                    break
                fib_windows.append(next_fib)
            return fib_windows

        fib_windows = get_fib_windows()
        
        # Base price and returns
        features['log_return'] = np.log(features['close'] / features['close'].shift(1))
        features['price_velocity'] = (features['close'] - features['close'].shift(1)) / 1.0  # per bar (divisor 1.0; originally per second on 1s bars)
        features['price_acceleration'] = features['price_velocity'] - features['price_velocity'].shift(1)
        
        # Fibonacci-based features
        for window in fib_windows:
            # Price features
            features[f'log_return_{window}s'] = np.log(features['close'] / features['close'].shift(window))
            features[f'volatility_{window}s'] = features['log_return'].rolling(window).std()
            features[f'range_{window}s'] = (features['high'].rolling(window).max() - 
                                        features['low'].rolling(window).min()) / features['close']
            
            # Volume features
            features[f'volume_momentum_{window}s'] = (
                features['volume'].rolling(window).mean() / 
                features['volume'].rolling(window * 2).mean()
            )
            
            features[f'buy_volume_momentum_{window}s'] = (
                features['buy_volume'].rolling(window).mean() / 
                features['buy_volume'].rolling(window * 2).mean()
            )
            
            features[f'sell_volume_momentum_{window}s'] = (
                features['sell_volume'].rolling(window).mean() / 
                features['sell_volume'].rolling(window * 2).mean()
            )
            
            # Trade features
            features[f'trade_intensity_{window}s'] = (
                features['trades_count'].rolling(window).mean() / 
                features['trades_count'].rolling(window * 2).mean()
            )
            
            features[f'avg_trade_size_{window}s'] = (
                features['volume'].rolling(window).sum() / 
                features['trades_count'].rolling(window).sum()
            )
            
            # Order flow features
            features[f'cum_volume_delta_{window}s'] = (
                features['buy_volume'] - features['sell_volume']
            ).rolling(window).sum()
            
            features[f'volume_pressure_{window}s'] = (
                features['buy_volume'].rolling(window).sum() / 
                features['sell_volume'].rolling(window).sum()
            )
            
            # Price efficiency
            features[f'price_efficiency_{window}s'] = (
                np.abs(features['close'] - features['close'].shift(window)) /
                (features['high'].rolling(window).max() - features['low'].rolling(window).min())
            )
            
            # Moving averages and their crosses
            features[f'sma_{window}s'] = features['close'].rolling(window).mean()
            if window > 5:  # Create MA crosses with shorter timeframe
                features[f'ma_cross_5_{window}s'] = (
                    features['close'].rolling(5).mean() - 
                    features['close'].rolling(window).mean()
                )
        
        # MA-based features
        ma_lengths = [5, 10, 20, 50]
        for length in ma_lengths:
            # Regular MAs
            features[f'ma_{length}'] = features['close'].rolling(length).mean()
            
            # MA slopes (rate of change)
            features[f'ma_{length}_slope'] = features[f'ma_{length}'].pct_change(3)
            
            # Price distance from MA
            features[f'price_ma_{length}_dist'] = (features['close'] - features[f'ma_{length}']) / features[f'ma_{length}']
            
            # MA crossovers
            if length > 5:
                features[f'ma_5_{length}_cross'] = (features['ma_5'] - features[f'ma_{length}']) / features[f'ma_{length}']
        
        # MA convergence/divergence
        features['ma_convergence'] = ((features['ma_5'] - features['ma_20']).abs() / 
                                    features['ma_20'].rolling(10).mean())
        
        # Volatility features using MAs
        features['ma_volatility'] = features['ma_5'].rolling(10).std() / features['ma_20']
        
        # MA momentum
        features['ma_momentum'] = (features['ma_5'] / features['ma_5'].shift(5) - 1) * 100
          
        
        # Cleanup and feature selection
        features = features.replace([np.inf, -np.inf], np.nan)
        
        lookback = 1000
        if len(features) > lookback:
            rolling_corr = features.iloc[-lookback:].corr().abs()
            upper = rolling_corr.where(np.triu(np.ones(rolling_corr.shape), k=1).astype(bool))
            to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
            print(f"Column highly correlated - maybe drop? {to_drop} ")
            #features = features.drop(columns=to_drop)
        
        feature_columns = list(features.columns)
        print(f"Features shape before dropna: {features.shape}")
        
        return features.dropna(), feature_columns
```
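A hypothetical usage sketch (not in the original file): build a 1-minute OHLCV frame with the columns `prepare_features` reads (`close`, `high`, `low`, `volume`, `trades`, `buyvolume`, `sellvolume`) and call the method. `FeatureEngineer` as the owning class and all sample values are illustrative assumptions.

```python
import numpy as np
import pandas as pd

# Hypothetical owner class - assumes prepare_features above is defined on it
engineer = FeatureEngineer()

# Synthetic 1-minute OHLCV frame with the columns the method expects
idx = pd.date_range("2025-06-30 09:30", periods=500, freq="min")
rng = np.random.default_rng(1)
close = pd.Series(100 + np.cumsum(rng.normal(0, 0.05, len(idx))), index=idx)

df = pd.DataFrame({
    "close": close,
    "high": close + 0.1,
    "low": close - 0.1,
    "volume": rng.integers(100, 1_000, len(idx)).astype(float),
    "trades": rng.integers(1, 50, len(idx)),
})
df["buyvolume"] = df["volume"] * rng.uniform(0.3, 0.7, len(idx))
df["sellvolume"] = df["volume"] - df["buyvolume"]

features, feature_columns = engineer.prepare_features(df)
print(features.shape, len(feature_columns))
```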

## Targets

### Unbalanced classes

Binary targets derived from returns are usually skewed, so weight the positive class with XGBoost's `scale_pos_weight`:

```python
from xgboost import XGBClassifier

# Compute scale_pos_weight as the ratio of negative to positive samples
n_0 = sum(y_train == 0)
n_1 = sum(y_train == 1)
scale_pos_weight = n_0 / n_1

model = XGBClassifier(scale_pos_weight=scale_pos_weight, ...)
```
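For example, with 900 negative and 100 positive training samples, `scale_pos_weight = 900 / 100 = 9`, so each positive example carries nine times the weight of a negative one in the loss.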
The target itself is produced by `create_target_regressor`:

```python
    def create_target_regressor(self, df: pd.DataFrame) -> pd.Series:
        """
        https://claude.ai/chat/8e7fe81c-ddbe-4e64-9af0-2bc4764fc5f0

        Creates enhanced target variable using adaptive returns based on market conditions.
        Key improvements sketched in the linked chat (only the noise reduction and
        cross-day removal are implemented below):
        1. Multi-timeframe momentum approach
        2. Volume-volatility regime adaptation
        3. Trend-following vs mean-reversion regime detection
        4. Noise reduction through sophisticated filtering

        Parameters:
        -----------
        df : pd.DataFrame
            Features df; only the 'close' column is used here
        
        Returns:
        --------
        pd.Series
            Enhanced target variable with cross-day targets removed
        """

        future_bars = self.config.forward_bars

        # Fast MA of future closes (currently unused - kept for experimentation)
        future_ma_fast = df['close'].shift(-future_bars).rolling(5).mean()

        # Calculate forward returns (original approach)
        forward_returns = df['close'].shift(-future_bars) / df['close'] - 1

        target = forward_returns

        # 6. Advanced noise reduction
        # Use exponential moving standard deviation for dynamic thresholds
        target_std = target.ewm(span=50, min_periods=20).std()
        
        # Adaptive thresholds based on rolling standard deviation
        upper_clip = 2.5 * target_std
        lower_clip = -2.5 * target_std
        
        # Apply soft clipping using hyperbolic tangent
        # (note: tanh already bounds the result to +/- 1 * target_std, so the
        # +/- 2.5 * target_std hard clips below are a non-binding safety net)
        target = target_std * np.tanh(target / target_std)

        # Final hard clips for extreme outliers
        target = target.clip(lower=lower_clip, upper=upper_clip)


        # 7. Remove cross-day targets and intraday seasonality
        target = self.remove_crossday_targets(target, df, future_bars)

        # optionally keep only the 10% most extreme values on each side and zero the rest
        #target = target.where((target > target.quantile(0.9)) | (target < target.quantile(0.1)), 0)

        print("after target generation", target.index[[0, -1]])
        
        return target
```
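A quick numeric check of the soft clip (illustrative, not from the original code): writing the EWM std as `sigma`, `sigma * tanh(x / sigma)` leaves small returns nearly untouched and compresses larger ones toward ±`sigma`.

```python
import numpy as np

sigma = 0.01  # stand-in for the rolling EWM std of the target
raw = np.array([0.002, 0.01, 0.03, 0.10])
soft = sigma * np.tanh(raw / sigma)
print(soft.round(5))  # ~[0.00197 0.00762 0.00995 0.01]
```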