import matplotlib
import matplotlib.dates as mdates
#matplotlib.use('Agg')  # Optionally force the non-interactive 'Agg' backend
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Tuple, Optional, List
from enum import Enum
from collections import defaultdict, Counter
from io import BytesIO
from pathlib import Path
from scipy.stats import zscore
import vectorbtpro as vbt
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
from rich import print as richprint
import v2realbot.controller.services as cs
from v2realbot.common.model import AnalyzerInputs, TradeDirection, TradeStatus, Trade, TradeStoplossType
from v2realbot.utils.utils import isrising, isfalling, zoneNY, price2dec, safe_get  # , print
from v2realbot.utils.historicals import get_historical_bars
from v2realbot.config import WEB_API_KEY, DATA_DIR, MEDIA_DIRECTORY
from v2realbot.enums.enums import RecordType, StartBarAlign, Mode, Account, OrderSide

# Re-space the datetime index so that, within a day, each entry starts
# `resolution` seconds after the previous one; the first entry of each day
# keeps its original time.
def adjust_datetime_iteratively(df, resolution):
    adjusted_times = []
    for i, current_time in enumerate(df.index):
        if i == 0:
            # The first entry is unchanged
            adjusted_times.append(current_time)
            continue

        previous_time = adjusted_times[-1]
        # Check if it's the same day
        if previous_time.date() == current_time.date():
            # Add the resolution to the previous datetime
            adjusted_time = previous_time + pd.Timedelta(seconds=resolution)
        else:
            # Different day, leave it as is
            adjusted_time = current_time

        adjusted_times.append(adjusted_time)

    # Update the DataFrame index
    df.index = pd.DatetimeIndex(adjusted_times)
    return df

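# A minimal sketch of the even-spacing behavior (illustrative values only):
# >>> idx = pd.DatetimeIndex(["2024-03-12 09:30:00", "2024-03-12 09:30:41"], tz="US/Eastern")
# >>> demo = pd.DataFrame({"Close": [1.0, 2.0]}, index=idx)
# >>> adjust_datetime_iteratively(demo, 23).index
# The second timestamp snaps to 09:30:23 (first + 23 s); a row on another day
# would keep its original time.
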
def convert_to_dataframe(ohlcv):
    """
    Convert a dictionary containing OHLCV data into a pandas DataFrame.

    Parameters:
    ohlcv (dict): Dictionary containing OHLCV data.
                  It should have keys 'time', 'open', 'high', 'low', 'close', 'volume', 'updated'.
                  'time' should be a list of float epoch timestamps (seconds, UTC).
                  'updated', if present, is converted the same way as 'time'.

    Returns:
    pd.DataFrame: DataFrame containing the OHLCV data with the index converted to US/Eastern time.
    """
    # If an 'index' key exists, rename it to 'custom_index' so it does not
    # clash with pandas
    try:
        if ohlcv.get('index', False):
            ohlcv['custom_index'] = ohlcv.pop('index')
    except Exception:
        pass

    # Keys that should keep their lowercase names
    keys_not_to_upper = ["time", "updated"]

    # Title-case the remaining keys (e.g. 'open' -> 'Open')
    for key in list(ohlcv.keys()):  # Iterate over a copy of the keys
        if key not in keys_not_to_upper:
            ohlcv[key.title()] = ohlcv.pop(key)

    # Create DataFrame from the dictionary
    df = pd.DataFrame(ohlcv)

    # Convert 'time' to datetime and set as index
    df['time'] = pd.to_datetime(df['time'], unit='s', utc=True)
    df.set_index('time', inplace=True)
    # Convert index to the US/Eastern time zone
    df.index = df.index.tz_convert('US/Eastern')
    if 'updated' in df.columns:
        df['updated'] = pd.to_datetime(df['updated'], unit='s', utc=True)
        df['updated'] = df['updated'].dt.tz_convert('US/Eastern')

    return df

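# A minimal sketch (hypothetical values; 'time' is epoch seconds UTC):
# >>> ohlcv = {"time": [1710253800.0], "open": [33.1], "high": [33.2],
# ...          "low": [33.0], "close": [33.15], "volume": [1200]}
# >>> convert_to_dataframe(ohlcv).columns.tolist()
# ['Open', 'High', 'Low', 'Close', 'Volume']
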
# Verbosity-gated print: shadows the built-in and forwards to rich's print
# only when the first argument is truthy.
def print(v, *args, **kwargs):
    if v:
        richprint(*args, **kwargs)

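# Usage: print(verbose, "message") emits "message" only when verbose is truthy.
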
def load_batch(runner_ids: List = None, batch_id: str = None, space_resolution_evenly = False, main_session_only = True, merge_ind2bars = True, bars_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Vwap'], indicators_columns = [], verbose = False) -> Tuple[int, dict]:
    """Load a batch (all runners from a single batch) into pandas dataframes.

    Args:
        runner_ids (List, optional): A list of runner identifiers. Defaults to None.
        batch_id (str, optional): The ID of a specific batch to retrieve. Defaults to None.
        space_resolution_evenly (bool, optional): If True, alters the index so it is spaced
            evenly at the batch's bar resolution (in seconds). Defaults to False.
        main_session_only (bool, optional): If True, keep only bars between 9:30 and 16:00.
            Defaults to True.
        merge_ind2bars (bool, optional): Merge indicators into the bars dataframe. Defaults to True.
        bars_columns (list, optional): Columns to keep in the bars df. Defaults to
            ['Open', 'High', 'Low', 'Close', 'Volume', 'Vwap'].
        indicators_columns (list, optional): Columns to keep in the indicators df.
            Defaults to an empty list.
        verbose (bool, optional): Print progress information. Defaults to False.

    Returns:
        Tuple[int, dict]: A tuple containing:
            * A status code: 0 on success, negative on error.
            * On success, a dictionary with keys 'bars', 'indicators' and 'cbar_indicators',
              each holding a pandas dataframe; on error, an error message string.
    """
    if runner_ids is None and batch_id is None:
        return -2, "runner_ids or batch_id must be present"

    if batch_id is not None:
        res, runner_ids = cs.get_archived_runnerslist_byBatchID(batch_id)
        if res != 0:
            print(verbose, f"no batch {batch_id} found")
            return -1, f"no batch {batch_id} found"

    # DATA PREPARATION
    bars = None
    indicators = None
    cnt = 0
    dfs = dict(bars=[], indicators=[], cbar_indicators=[])
    resolution = None
    for id in runner_ids:
        cnt += 1
        # Get runner detail
        res, sada = cs.get_archived_runner_details_byID(id)
        if res != 0:
            print(verbose, f"no runner {id} found")
            return -1, f"no runner {id} found"

        if resolution is None:
            resolution = sada["bars"]["resolution"][0]
            print(verbose, f"Resolution : {resolution}")

        # Add daily bars limited to the required columns; keep 'updated' as the
        # mapping column to indicators
        bars = convert_to_dataframe(sada["bars"])[bars_columns + ["updated"]]
        #bars = bars.loc[:, bars_columns]

        indicators = convert_to_dataframe(sada["indicators"][0])[indicators_columns]

        # Join indicators to the bars dataframe
        if merge_ind2bars:
            # Merge: 'time' in indicators corresponds to 'updated' in bars
            bars = bars.reset_index()
            bars = pd.merge(bars, indicators, left_on="updated", right_on="time", how="left")
            bars = bars.set_index("time")
        else:
            dfs["indicators"].append(indicators)

        # Drop 'updated' as a mapping column
        #bars = bars.drop("updated", axis=1)
        dfs["bars"].append(bars)

        #indicators = sada["indicators"][0]
        #cbar_indicators = sada["indicators"][1]

    # Merge all days into a single df
    for key in dfs:
        if len(dfs[key]) > 0:
            concat_df = pd.concat(dfs[key], axis=0)
            concat_df = concat_df.between_time('9:30', '16:00') if main_session_only else concat_df

            # Count the number of duplicates (excluding the first occurrence)
            num_duplicates = concat_df.index.duplicated().sum()

            if num_duplicates > 0:
                print(verbose, f"NOTE: DUPLICATES {num_duplicates}/{len(concat_df)} in {key}. REMOVING.")
                concat_df = concat_df[~concat_df.index.duplicated()]

                num_duplicates = concat_df.index.duplicated().sum()
                print(verbose, f"Now there are {num_duplicates}/{len(concat_df)}")

            if space_resolution_evenly and key != "cbar_indicators":
                # Space the datetime index evenly according to the resolution (in seconds)
                concat_df = adjust_datetime_iteratively(concat_df, resolution)

            dfs[key] = concat_df
    return 0, dfs

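# Loading by explicit runner ids instead of a batch id works the same way
# (a sketch; the ids below are hypothetical placeholders):
# >>> res, dfs = load_batch(runner_ids=["<runner-uuid-1>", "<runner-uuid-2>"])
# >>> res == 0 and sorted(dfs) == ['bars', 'cbar_indicators', 'indicators']
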
if __name__ == "__main__":
|
|
res, df = load_batch(batch_id="e44a5075", space_resolution_evenly=True, indicators_columns=["Rsi14"], main_session_only=False)
|
|
if res < 0:
|
|
print("Error" + str(res) + str(df))
|
|
print(df)
|
|
df = df["bars"]
|
|
print(df.info(), df.head())
|
|
#filter columns
|
|
#columns_to_keep = ['Open', 'High', 'Low', 'Close', 'Volume', 'Vwap']
|
|
#df = df.loc[:, columns_to_keep]
|
|
#df = df.rename(columns={'index': 'custom_index'})
|
|
print(df.info(), df.head(), df.describe())
|
|
#filter times
|
|
#df = df.between_time('9:30', '16:00')
|
|
print(df.info())
|
|
# Set the frequency to 23 seconds
|
|
#df.index.freq = pd.tseries.offsets.Second(23)
|
|
# Check the frequency of the index
|
|
|
|
# Resample and aggregate the data
|
|
# resampled_df = df.resample('23S').agg({
|
|
# 'open': 'first',
|
|
# 'high': 'max',
|
|
# 'low': 'min',
|
|
# 'close': 'last',
|
|
# 'volume': 'sum'
|
|
# })
|
|
|
|
#df.index.freq = pd.infer_freq(df.index)
|
|
#print(df.index.freq)
|
|
|
|
|
|
# Set the frequency of the index explicitly - if it exists like 1T etc, if doesnt exists then custom_frequency will be used
|
|
#df.index.freq = pd.date_range(start=df.index[0], periods=len(df), freq='23S')
|
|
|
|
print(df.info())
|
|
|
|
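    # With an evenly spaced intraday index, a fixed frequency can be attached
    # per session via pandas asfreq (a sketch; the date is hypothetical and this
    # only holds within a single trading day):
    # day = df.loc['2024-03-12'].asfreq('23s')  # fills any remaining gaps with NaN
    # richprint(day.index.freq)
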
vbt.settings.set_theme("dark")
|
|
vbt.settings['plotting']['layout']['width'] = 1280
|
|
vbt.settings.plotting.auto_rangebreaks = True
|
|
|
|
#naloadujeme do vbt symbol as column
|
|
bar_data = vbt.Data.from_data({"BAC": df}, tz_convert="US/Eastern")
|
|
print(bar_data)
|
|
print(bar_data.close)
|
|
|
|
print(bar_data.data["BAC"]["Rsi14"])
|
|
bar_data.data["BAC"]["Rsi14"].vbt.plot().show()
|
|
print(bar_data["Rsi14"])
|
|
|
|
|
|
    # OHLCV plot (2x1 subplots)
    bar_data.data["BAC"].vbt.ohlcv.plot().show()

    # Create 3x1 subplots (OHLCV + RSI)
    # fig = vbt.make_subplots(rows=3, cols=1)
    # bar_data.data["BAC"].vbt.ohlcv.plot(add_trace_kwargs=dict(row=1, col=1), fig=fig)
    # bar_data.data["BAC"]["Rsi14"].vbt.plot(add_trace_kwargs=dict(row=3, col=1), fig=fig)
    # fig.show()

    # Create subplots with a secondary Y axis - RSI overlay
    fig1 = vbt.make_subplots(specs=[[{"secondary_y": True}]])
    bar_data.data["BAC"]["Close"].vbt.plot(add_trace_kwargs=dict(secondary_y=False), fig=fig1)
    bar_data.data["BAC"]["Rsi14"].vbt.plot(add_trace_kwargs=dict(secondary_y=True), fig=fig1)
    fig1.show()

puv_df = bar_data.data["BAC"]
|
|
|
|
bar_data23s = bar_data[["Open", "High", "Low", "Close", "Volume"]]
|
|
print(bar_data23s)
|
|
#resample by vbt
|
|
bar_data46s = bar_data23s.get().resample("46s").agg({
|
|
"Open": "first",
|
|
"High": "max",
|
|
"Low": "min",
|
|
"Close": "last",
|
|
"Volume": "sum"
|
|
})
|
|
|
|
print(bar_data46s)
|
|
res_data = bar_data46s.data["BAC"]
|
|
#bar_data23s.data["BAC"].ptable()
|
|
#bar_data23s = bar_data.resample("23S")
|
|
print(bar_data46s)
|
|
print(bar_data46s.close)
|
|
vbt.settings.plotting.auto_rangebreaks = True
|
|
bar_data46s.data["BAC"].vbt.ohlcv.plot().show()
|
|
|
|
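    # vectorbtpro can also resample a Data object directly (per the commented-out
    # line above); a sketch, assuming Data.resample accepts a pandas frequency string:
    # bar_data46s_alt = bar_data23s.resample("46s")
    # bar_data46s_alt.data["BAC"].vbt.ohlcv.plot().show()
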
    # TARGET DAYS - a single day or a range
    # Target date
    #target_date = pd.to_datetime('2023-10-12', tz='US/Eastern')

    # Date range
    start_date = pd.to_datetime('2024-03-12')
    #end_date = pd.to_datetime('2023-10-14')

    new_data = bar_data.transform(lambda d: d[d.index.date == start_date.date()])
    #range: filtered_data = data[(data.index >= start_date) & (data.index <= end_date)]

    richprint(new_data)
    new_data.data["BAC"].vbt.ohlcv.plot().show()

    # Filtering by RANGE or DAY
    # filtered_data = data[(data.index >= start_date) & (data.index <= end_date)]
    # filtered_data = data[data.index.date == target_date.date()]

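    # A range filter works through the same transform pattern (a sketch; the
    # end date is hypothetical, and .date() comparison sidesteps tz handling):
    # end_date = pd.to_datetime('2024-03-14')
    # range_data = bar_data.transform(
    #     lambda d: d[(d.index.date >= start_date.date()) & (d.index.date <= end_date.date())]
    # )
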
    # Custom aggregation
    # ohlcv_agg = pd.DataFrame({
    #     'Open': df.resample('1T')['Open'].first(),
    #     'High': df.resample('1T')['High'].max(),
    #     'Low': df.resample('1T')['Low'].min(),
    #     'Close': df.resample('1T')['Close'].last(),
    #     'Volume': df.resample('1T')['Volume'].sum()
    # })

    # Define a custom frequency with a timedelta of 23 seconds
    # custom_frequency = pd.tseries.offsets.DateOffset(seconds=23)

    # # Create a new DataFrame with the desired frequency
    # new_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=custom_frequency)
    # new_df = pd.DataFrame(index=new_index)

    # # Reindex the DataFrame
    # df = df.reindex(new_df.index)

    # # Now you can check the frequency of the index
    # print(df.index.freq)