Cross-validation

In [ ]:
from vectorbtpro import *
# whats_imported()

vbt.settings.set_theme("dark")
In [ ]:
data = vbt.BinanceData.pull("BTCUSDT", end="2022-11-01 UTC")
data.index
In [ ]:
@vbt.parameterized(merge_func="concat")
def sma_crossover_perf(data, fast_window, slow_window):
    fast_sma = data.run("sma", fast_window, short_name="fast_sma")
    slow_sma = data.run("sma", slow_window, short_name="slow_sma")
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    pf = vbt.Portfolio.from_signals(
        data, entries, exits, direction="both")
    return pf.sharpe_ratio
In [ ]:
perf = sma_crossover_perf(
    data["2020":"2020"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    )
)
perf
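
A quick sanity check (added here for illustration, not part of the original pipeline): the condition "x < slow_window" should cut the full 45 × 45 window grid down to 45 · 44 / 2 = 990 combinations, which should match the length of perf.

In [ ]:
import itertools

# Count the (fast, slow) pairs that survive the condition above
n_combos = sum(
    1
    for fast, slow in itertools.product(range(5, 50), repeat=2)
    if fast < slow
)
n_combos  # 990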
In [ ]:
perf.sort_values(ascending=False)
In [ ]:
best_fast_window, best_slow_window = perf.idxmax()
sma_crossover_perf(
    data["2021":"2021"],
    best_fast_window,
    best_slow_window
)
In [ ]:
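# Baseline: buy-and-hold Sharpe over the same out-of-sample year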
data["2021":"2021"].run("from_holding").sharpe_ratio
In [ ]:
start_index = data.index[0]
period = pd.Timedelta(days=180)
all_is_bounds = {}
all_is_bl_perf = {}
all_is_perf = {}
all_oos_bounds = {}
all_oos_bl_perf = {}
all_oos_perf = {}
split_idx = 0
period_idx = 0

with vbt.ProgressBar() as pbar:
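    # Anchored walk-forward: optimize on each 180-day in-sample (IS)
    # window, test the best parameters on the following 180-day
    # out-of-sample (OOS) window, then roll forward by one period so
    # each OOS window becomes the next split's IS window.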
    while start_index + 2 * period <= data.index[-1]:
        pbar.set_prefix(str(start_index))

        is_start_index = start_index
        is_end_index = start_index + period - pd.Timedelta(nanoseconds=1)
        is_data = data[is_start_index : is_end_index]
        is_bl_perf = is_data.run("from_holding").sharpe_ratio
        is_perf = sma_crossover_perf(
            is_data,
            vbt.Param(np.arange(5, 50), condition="x < slow_window"),
            vbt.Param(np.arange(5, 50)),
            _execute_kwargs=dict(
                clear_cache=50,
                collect_garbage=50
            )
        )

        oos_start_index = start_index + period
        oos_end_index = start_index + 2 * period - pd.Timedelta(nanoseconds=1)
        oos_data = data[oos_start_index : oos_end_index]
        oos_bl_perf = oos_data.run("from_holding").sharpe_ratio
        best_fw, best_sw = is_perf.idxmax()
        oos_perf = sma_crossover_perf(oos_data, best_fw, best_sw)
        oos_perf_index = is_perf.index[is_perf.index == (best_fw, best_sw)]
        oos_perf = pd.Series([oos_perf], index=oos_perf_index)

        all_is_bounds[period_idx] = (is_start_index, is_end_index)
        all_oos_bounds[period_idx + 1] = (oos_start_index, oos_end_index)
        all_is_bl_perf[(split_idx, period_idx)] = is_bl_perf
        all_oos_bl_perf[(split_idx, period_idx + 1)] = oos_bl_perf
        all_is_perf[(split_idx, period_idx)] = is_perf
        all_oos_perf[(split_idx, period_idx + 1)] = oos_perf
        start_index = start_index + period
        split_idx += 1
        period_idx += 1
        pbar.update()
In [ ]:
is_period_ranges = pd.DataFrame.from_dict(
    all_is_bounds, 
    orient="index",
    columns=["start", "end"]
)
is_period_ranges.index.name = "period"
oos_period_ranges = pd.DataFrame.from_dict(
    all_oos_bounds, 
    orient="index",
    columns=["start", "end"]
)
oos_period_ranges.index.name = "period"
period_ranges = pd.concat((is_period_ranges, oos_period_ranges))
period_ranges = period_ranges.drop_duplicates()
print(period_ranges)
In [ ]:
is_bl_perf = pd.Series(all_is_bl_perf)
is_bl_perf.index.names = ["split", "period"]
oos_bl_perf = pd.Series(all_oos_bl_perf)
oos_bl_perf.index.names = ["split", "period"]
bl_perf = pd.concat((
    is_bl_perf.vbt.select_levels("period"), 
    oos_bl_perf.vbt.select_levels("period")
))
bl_perf = bl_perf.drop_duplicates()
bl_perf
In [ ]:
is_perf = pd.concat(all_is_perf, names=["split", "period"])
is_perf
In [ ]:
oos_perf = pd.concat(all_oos_perf, names=["split", "period"])
oos_perf
In [ ]:
is_best_mask = is_perf.index.vbt.drop_levels("period").isin(
    oos_perf.index.vbt.drop_levels("period"))
is_best_perf = is_perf[is_best_mask]
is_best_perf
In [ ]:
print(pd.concat((
    is_perf.describe(),
    is_best_perf.describe(),
    is_bl_perf.describe(),
    oos_perf.describe(),
    oos_bl_perf.describe()
), axis=1, keys=[
    "IS", 
    "IS (Best)", 
    "IS (Baseline)", 
    "OOS (Test)", 
    "OOS (Baseline)"
]))
In [ ]:
fig = is_perf.vbt.boxplot(
    by_level="period",
    trace_kwargs=dict(
        line=dict(color="lightskyblue"), 
        opacity=0.4,
        showlegend=False
    ),
    xaxis_title="Period", 
    yaxis_title="Sharpe",
)
fig = is_best_perf.vbt.select_levels("period").vbt.plot(
    trace_kwargs=dict(
        name="Best", 
        line=dict(color="limegreen", dash="dash")
    ), 
    fig=fig
)
fig = bl_perf.vbt.plot(
    trace_kwargs=dict(
        name="Baseline", 
        line=dict(color="orange", dash="dash")
    ), 
    fig=fig
)
fig = oos_perf.vbt.select_levels("period").vbt.plot(
    trace_kwargs=dict(
        name="Test", 
        line=dict(color="orangered")
    ), 
    fig=fig
)
fig.show_svg()
In [ ]:
is_perf_split6 = is_perf.xs(6, level="split")
is_perf_split6.describe()
In [ ]:
period6_start = period_ranges.loc[6, "start"]
period6_end = period_ranges.loc[6, "end"]
data[period6_start : period6_end].plot().show_svg()
In [ ]:
oos_perf.xs(6, level="period")
In [ ]:
is_perf_split6.quantile(0.25)

Splitter

In [ ]:
splitter = vbt.Splitter.from_rolling(
    data.index, 
    length=360, 
    split=0.5,
    set_labels=["IS", "OOS"]
)
splitter.plot().show_svg()

Schema

In [ ]:
print(splitter.splits)
In [ ]:
splitter.index
In [ ]:
splitter.wrapper.index
In [ ]:
splitter.wrapper.columns
In [ ]:
oos_splitter = splitter["OOS"]
print(oos_splitter.splits)

Range format

In [ ]:
index = vbt.date_range("2020", periods=14)
index[slice(1, 7)]
In [ ]:
index[1], index[6]

Relative

In [ ]:
rel_range = vbt.RelRange(offset=10, length=40)
rel_range
In [ ]:
rel_range.to_slice(total_len=len(splitter.index), prev_end=100)
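
The same resolution by hand (an illustrative sketch, assuming the default offset anchor is the end of the previous range): an offset of 10 past prev_end=100 gives start 110, and a length of 40 gives slice(110, 150).

In [ ]:
prev_end, offset, length = 100, 10, 40
slice(prev_end + offset, prev_end + offset + length)  # slice(110, 150)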

Array format

In [ ]:
index = vbt.date_range("2020", "2021", freq="1min")
range_ = np.arange(len(index))
range_.nbytes / 1024 / 1024
In [ ]:
range_ = np.full(len(index), True)
range_.nbytes / 1024 / 1024
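
The gap is purely element width (a rough check added for illustration): the same number of positions stored as 8-byte int64 indices versus 1-byte booleans.

In [ ]:
n = len(index)
n * 8 / 1024 / 1024, n / 1024 / 1024  # int64 MiB vs bool MiB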
In [ ]:
splitter.splits_arr.dtype
In [ ]:
id(slice(0, 180, None))
In [ ]:
range_00 = np.arange(0, 5)
range_01 = np.arange(5, 15)
range_10 = np.arange(15, 30)
range_11 = np.arange(30, 50)

ind_splitter = vbt.Splitter.from_splits(
    data.index,
    [[range_00, range_01], [range_10, range_11]],
    fix_ranges=False
)
print(ind_splitter.splits)
In [ ]:
ind_splitter.splits.loc[0, "set_1"]
In [ ]:
ind_splitter.splits.loc[0, "set_1"].range_

Preparation

Splits

In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange()),
    index=data.index
)
In [ ]:
splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange())
)
In [ ]:
data[slice(0, 1426, None)]
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    0.75, 
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    -0.25,
    index=data.index
)
In [ ]:
int(0.75 * len(data.index))
In [ ]:
len(data.index) - int(0.25 * len(data.index))
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (vbt.RelRange(), vbt.RelRange(length=0.25)),
    backwards=True,
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (1.0, 30), 
    backwards=True,
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (
        vbt.RelRange(length=0.4, length_space="all"), 
        vbt.RelRange(length=0.4, length_space="all"),
        vbt.RelRange()
    ),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange(offset=1)),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (
        vbt.RelRange(length=0.75), 
        vbt.RelRange(length=1, is_gap=True),
        vbt.RelRange()
    ),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (np.array([3, 4, 5]), np.array([6, 8, 10])),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (np.array([3, 4, 5]), np.array([6, 8, 10])),
    range_format="indices",
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (slice("2020", "2021"), slice("2021", "2022")),
    index=data.index
)
In [ ]:
data.index[867:1233]
In [ ]:
data.index[1233:1598]
In [ ]:
vbt.Splitter.split_range(
    slice(None), 
    (
        vbt.RelRange(length="180 days"), 
        vbt.RelRange(offset="1 day", length="90 days")
    ),
    index=data.index
)

Method

In [ ]:
manual_splitter = vbt.Splitter.from_splits(
    data.index,
    [
        (vbt.RelRange(), vbt.RelRange(offset=0.5, length=0.25, length_space="all")),
        (vbt.RelRange(), vbt.RelRange(offset=0.25, length=0.25, length_space="all")),
        (vbt.RelRange(), vbt.RelRange(offset=0, length=0.25, length_space="all")),
    ],
    split_range_kwargs=dict(backwards=True),
    set_labels=["IS", "OOS"]
)
print(manual_splitter.splits)
In [ ]:
manual_splitter.plot().show_svg()

Generation

Rolling

In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index, 
    length=360,
    offset=90
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index, 
    length=360,
    offset=-0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index, 
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
    split=0.5,
    offset_anchor_set=None
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=5,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=3,
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=7,
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_expanding(
    data.index, 
    min_length=360,
    offset=180,
    split=-180
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_expanding(
    data.index, 
    n=5,
    min_length=360,
    split=-180
).plot().show_svg()

Anchored

In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Y",
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Q",
    lookback_period="Y",
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Q",
    lookback_period="Y",
    split=(
        vbt.RepEval("index.month != index.month[-1]"),
        vbt.RepEval("index.month == index.month[-1]")
    )
).plot().show_svg()
In [ ]:
def qyear(index):
    return index.to_period("Q")

vbt.Splitter.from_ranges(
    data.index,
    start=0,
    fixed_start=True,
    every="Q",
    closed_end=True,
    split=(
        lambda index: qyear(index) != qyear(index)[-1],
        lambda index: qyear(index) == qyear(index)[-1]
    )
).plot().show_svg()
In [ ]:
vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5
).plot().show_svg()
In [ ]:
def is_split_complete(index, split):
    first_range = split[0]
    first_index = index[first_range][0]
    last_range = split[-1]
    last_index = index[last_range][-1]
    return first_index.is_year_start and last_index.is_year_end

vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete)
).plot().show_svg()
In [ ]:
def format_split_labels(index, splits_arr):
    years = map(lambda x: index[x[0]][0].year, splits_arr)
    return pd.Index(years, name="split_year")

vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete),
    split_labels=vbt.RepFunc(format_split_labels)
).plot().show_svg()
In [ ]:
vbt.Splitter.from_grouper(
    data.index,
    by=data.index.year,
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete)
).plot().show_svg()

Random

In [ ]:
vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=360,
    seed=42,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=60,
    max_length=480,
    seed=42,
    split=0.5
).plot().show_svg()
In [ ]:
def start_p_func(i, indices):
    return indices / indices.sum()

vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=60,
    max_length=480,
    seed=42,
    start_p_func=start_p_func,
    split=0.5
).plot().show_svg()

Scikit-learn

In [ ]:
from sklearn.model_selection import KFold

vbt.Splitter.from_sklearn(
    data.index, 
    KFold(n_splits=5)
).plot().show_svg()
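
Any scikit-learn splitter exposing the split() interface should work the same way; for example (an added sketch, not from the original notebook), TimeSeriesSplit yields expanding train windows:

In [ ]:
from sklearn.model_selection import TimeSeriesSplit

vbt.Splitter.from_sklearn(
    data.index, 
    TimeSeriesSplit(n_splits=5)
).plot().show_svg()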

Dynamic

In [ ]:
def split_func(index, prev_start):
    if prev_start is None:
        prev_start = index[0]
    new_start = prev_start + pd.offsets.MonthBegin(1)
    new_end = new_start + pd.DateOffset(years=1)
    if new_end > index[-1] + index.freq:
        return None
    return [
        slice(new_start, new_start + pd.offsets.MonthBegin(9)),
        slice(new_start + pd.offsets.MonthBegin(9), new_end)
    ]

vbt.Splitter.from_split_func(
    data.index,
    split_func=split_func,
    split_args=(vbt.Rep("index"), vbt.Rep("prev_start")),
    range_bounds_kwargs=dict(index_bounds=True)
).plot().show_svg()
In [ ]:
def get_next_monday(from_date):
    if from_date.weekday() == 0 and from_date.ceil("H").hour <= 9:
        return from_date.floor("D")
    return from_date.floor("D") + pd.offsets.Week(n=0, weekday=0)

def get_next_business_range(from_date):
    monday_0000 = get_next_monday(from_date)
    monday_0900 = monday_0000 + pd.DateOffset(hours=9)
    friday_1700 = monday_0900 + pd.DateOffset(days=4, hours=8)
    return slice(monday_0900, friday_1700)

def split_func(index, bounds):
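    # Called repeatedly by from_split_func; returning None stops the
    # generation. "bounds" holds the index bounds of all ranges built
    # so far, so bounds[-1][1][0] is the start of the previous test
    # range, which anchors the next train week.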
    if len(bounds) == 0:
        from_date = index[0]
    else:
        from_date = bounds[-1][1][0]
    train_range = get_next_business_range(from_date)
    test_range = get_next_business_range(train_range.stop)
    if test_range.stop > index[-1] + index.freq:
        return None
    return train_range, test_range

vbt.Splitter.from_split_func(
    vbt.date_range("2020-01", "2020-03", freq="15min"),
    split_func=split_func,
    split_args=(vbt.Rep("index"), vbt.Rep("bounds")),
    range_bounds_kwargs=dict(index_bounds=True)
).plot().show_svg()

Validation

In [ ]:
splitter = vbt.Splitter.from_ranges(
    data.index,
    every="Y",
    closed_end=True,
    split=0.5,
    set_labels=["IS", "OOS"]
)
splitter.plot().show_svg()

Bounds

In [ ]:
bounds_arr = splitter.get_bounds_arr()
bounds_arr.shape
In [ ]:
print(bounds_arr)
In [ ]:
bounds = splitter.get_bounds(index_bounds=True)
bounds.shape
In [ ]:
print(bounds)
In [ ]:
bounds.loc[(0, "OOS"), "end"]
In [ ]:
bounds.loc[(1, "IS"), "start"]

Masks

In [ ]:
mask = splitter.get_mask()
mask.shape
In [ ]:
print(mask)
In [ ]:
mask["2021":"2021"].any()
In [ ]:
print(mask.resample(vbt.offset("Y")).sum())
In [ ]:
results = []
for mask in splitter.get_iter_split_masks():
    results.append(mask.resample(vbt.offset("Y")).sum())
print(pd.concat(results, axis=1, keys=splitter.split_labels))

Coverage

In [ ]:
splitter.get_split_coverage()
In [ ]:
splitter.get_set_coverage()
In [ ]:
splitter.get_range_coverage()
In [ ]:
splitter.get_coverage()
In [ ]:
splitter.index_bounds.loc[(2, "OOS"), "start"].is_leap_year
In [ ]:
splitter.get_range_coverage(relative=True)
In [ ]:
splitter.get_set_coverage(relative=True)
In [ ]:
splitter.get_split_coverage(overlapping=True)
In [ ]:
splitter.get_set_coverage(overlapping=True)
In [ ]:
splitter.get_coverage(overlapping=True)
In [ ]:
splitter.plot_coverage().show_svg()
In [ ]:
print(splitter.get_overlap_matrix(by="range", normalize=False))

Grouping

In [ ]:
print(splitter.get_bounds(index_bounds=True, set_group_by=True))

Manipulation

In [ ]:
splitter = vbt.Splitter.from_grouper(
    data.index, 
    by=data.index.year.rename("split_year")
)
In [ ]:
splitter.stats()
In [ ]:
splitter.plots().show_svg()
In [ ]:
splitter = splitter.iloc[1:-1]
splitter.stats()
In [ ]:
def new_split(index):
    return [
        np.isin(index.quarter, [1, 2]), 
        index.quarter == 3, 
        index.quarter == 4
    ]

splitter = splitter.split_set(
    vbt.RepFunc(new_split),
    new_set_labels=["train", "valid", "test"]
)
In [ ]:
splitter.stats()
In [ ]:
splitter.plots().show_svg()

Homework

In [ ]:
splitter = splitter.merge_sets(columns=["valid", "test"], new_set_label="test")
splitter.plots().show_svg()

Applications

Taking

Without stacking

In [ ]:
close_slices = splitter.take(data.close)
close_slices
In [ ]:
close_slices[2020, "test"]
In [ ]:
def get_total_return(sr):
    return sr.vbt.to_returns().vbt.returns.total()

close_slices.apply(get_total_return)

Complex objects

In [ ]:
trendlb = data.run("trendlb", 1.0, 0.5)
trendlb.plot().show_svg()
In [ ]:
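# Group timestamps by their trend label: every up ("U") and down ("D")
# segment joins the corresponding split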
grouper = pd.Index(trendlb.labels.map({1: "U", 0: "D"}), name="trend")
trend_splitter = vbt.Splitter.from_grouper(data.index, grouper)
trend_splitter.plot().show_svg()
In [ ]:
hold_pf = vbt.Portfolio.from_holding(data)
hold_returns_acc = hold_pf.returns_acc

fast_sma, slow_sma = vbt.talib("SMA").run_combs(
    data.close, np.arange(5, 50), short_names=["fast_sma", "slow_sma"])
entries = fast_sma.real_crossed_above(slow_sma)
exits = fast_sma.real_crossed_below(slow_sma)
strat_pf = vbt.Portfolio.from_signals(
    data, entries, exits, direction="both")
strat_returns_acc = strat_pf.returns_acc
In [ ]:
hold_returns_acc_slices = trend_splitter.take(hold_returns_acc)
strat_returns_acc_slices = trend_splitter.take(strat_returns_acc)
In [ ]:
hold_returns_acc_slices["U"].sharpe_ratio()
In [ ]:
strat_returns_acc_slices["U"].sharpe_ratio().vbt.heatmap(
    x_level="fast_sma_timeperiod", 
    y_level="slow_sma_timeperiod",
    symmetric=True
).show_svg()
In [ ]:
hold_returns_acc_slices["D"].sharpe_ratio()
In [ ]:
strat_returns_acc_slices["D"].sharpe_ratio().vbt.heatmap(
    x_level="fast_sma_timeperiod", 
    y_level="slow_sma_timeperiod",
    symmetric=True
).show_svg()
In [ ]:
trend_splitter = trend_splitter.break_up_splits("by_gap", sort=True)
trend_splitter.plot().show_svg()
In [ ]:
strat_pf_slices = strat_pf.split(trend_splitter)
strat_pf_slices
In [ ]:
trend_range_perf = strat_pf_slices.apply(lambda pf: pf.sharpe_ratio)
median_trend_perf = trend_range_perf.median(axis=1)
median_trend_perf
In [ ]:
trend_perf_ts = data.symbol_wrapper.fill().rename("trend_perf")
for label, sr in trend_splitter.bounds.iterrows():
    trend_perf_ts.iloc[sr["start"]:sr["end"]] = median_trend_perf[label]
data.close.vbt.overlay_with_heatmap(trend_perf_ts).show_svg()

Column stacking

In [ ]:
close_stacked = pd.concat(
    close_slices.values.tolist(), 
    axis=1, 
    keys=close_slices.index
)
print(close_stacked)
In [ ]:
get_total_return(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="stacked")
close_stacked.shape
In [ ]:
close_stacked = splitter.take(data.close, into="reset_stacked")
print(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="from_end_stacked")
print(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="reset_stacked_by_set")
close_stacked
In [ ]:
print(close_stacked["train"])
In [ ]:
print(182 * 8)
print(1461 * 8)
print(1 - 1456 / 11688)
In [ ]:
index_slices = splitter.take(data.index)
index_slices
In [ ]:
close_stacked_wb = splitter.take(
    data.close, 
    into="reset_stacked_by_set",
    attach_bounds="index",
    right_inclusive=True
)
print(close_stacked_wb["train"])
In [ ]:
@vbt.parameterized(merge_func="concat")
def set_sma_crossover_perf(close, fast_window, slow_window, freq):
    fast_sma = vbt.talib("sma").run(
        close, fast_window, short_name="fast_sma", hide_params=True) 
    slow_sma = vbt.talib("sma").run(
        close, slow_window, short_name="slow_sma", hide_params=True) 
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    pf = vbt.Portfolio.from_signals(
        close, entries, exits, freq=freq, direction="both")
    return pf.sharpe_ratio
In [ ]:
train_perf = set_sma_crossover_perf(
    close_stacked["train"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    data.index.freq,
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    )
)
In [ ]:
train_perf
In [ ]:
train_perf.vbt.heatmap(
    x_level="fast_window",
    y_level="slow_window",
    slider_level="split_year",
    symmetric=True
).show_svg()  # replace with show()
In [ ]:
@njit
def prox_median_nb(arr):
    if (~np.isnan(arr)).sum() < 20:
        return np.nan
    return np.nanmedian(arr)

prox_perf_list = []
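# Smooth each split's parameter surface: replace every (fast, slow)
# cell by the median Sharpe over its 5x5 neighborhood (proximity_apply
# with a window of 2 in each direction), requiring >= 20 valid values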
for split_label, perf_sr in train_perf.groupby("split_year"):
    perf_df = perf_sr.vbt.unstack_to_df(0, [1, 2])
    prox_perf_df = perf_df.vbt.proximity_apply(2, prox_median_nb)
    prox_perf_sr = prox_perf_df.stack([0, 1])
    prox_perf_list.append(prox_perf_sr.reindex(perf_sr.index))

train_prox_perf = pd.concat(prox_perf_list)
train_prox_perf
In [ ]:
train_prox_perf.vbt.heatmap(
    x_level="fast_window",
    y_level="slow_window",
    slider_level="split_year",
    symmetric=True
).show_svg()  # replace with show()
In [ ]:
best_params = train_prox_perf.groupby("split_year").idxmax()
best_params = train_prox_perf[best_params].index
train_prox_perf[best_params]
In [ ]:
test_perf = set_sma_crossover_perf(
    vbt.RepEval(
        "test_close.iloc[:, [config_idx]]", 
        context=dict(test_close=close_stacked["test"])
    ),
    vbt.Param(best_params.get_level_values("fast_window"), level=0),
    vbt.Param(best_params.get_level_values("slow_window"), level=0),
    data.index.freq
)
test_perf
In [ ]:
def get_index_sharpe(index):
    return data.loc[index].run("from_holding").sharpe_ratio

index_slices.xs("test", level="set").apply(get_index_sharpe)

Row stacking

In [ ]:
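# Rule-of-thumb block length for a block bootstrap: about
# 3.15 * n^(1/3), so blocks grow with the cube root of the sample
# size and preserve short-range serial dependence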
block_size = int(3.15 * len(data.index) ** (1 / 3))
block_splitter = vbt.Splitter.from_rolling(
    data.index, 
    length=block_size, 
    offset=1,
    offset_anchor="prev_start"
)
block_splitter.n_splits
In [ ]:
size = int(block_splitter.n_splits / block_size)
sample_splitter = block_splitter.shuffle_splits(size=size, replace=True)
sample_splitter.plot().show_svg()
In [ ]:
returns = data.returns
sample_rets = sample_splitter.take(returns, into="stacked", stack_axis=0)
sample_rets
In [ ]:
sample_rets.index = data.index[:len(sample_rets)]
sample_cumrets = data.close.iloc[0] * (sample_rets + 1).cumprod()
sample_cumrets.vbt.plot().show_svg()
In [ ]:
samples_rets_list = []
for i in vbt.ProgressBar(range(1000)):
    sample_spl = block_splitter.shuffle_splits(size=size, replace=True)
    sample_rets = sample_spl.take(returns, into="stacked", stack_axis=0)
    sample_rets.index = returns.index[:len(sample_rets)]
    sample_rets.name = i
    samples_rets_list.append(sample_rets)
sample_rets_stacked = pd.concat(samples_rets_list, axis=1)
In [ ]:
sample_sharpe = sample_rets_stacked.vbt.returns.sharpe_ratio()
sample_sharpe.vbt.boxplot(horizontal=True).show_svg()
In [ ]:
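# Two-sided 95% bootstrap confidence interval for the Sharpe ratio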
sample_sharpe.quantile(0.025), sample_sharpe.quantile(0.975)

Applying

In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data.close),
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.RepFunc(lambda range_: data.close[range_]),
    merge_func="concat"
)
In [ ]:
def get_total_return(range_, data):
    return data.returns[range_].vbt.returns.total()

splitter.apply(
    get_total_return,
    vbt.Rep("range_"),
    data,
    merge_func="concat"
)
In [ ]:
def get_total_return(data):
    return data.returns.vbt.returns.total()

splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    set_group_by=True,
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    split=[2020, 2021],
    set_="train",
    merge_func="concat"
)
In [ ]:
train_perf = splitter.apply(
    sma_crossover_perf,
    vbt.Takeable(data),
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    ),
    set_="train",
    merge_func="concat",
)
In [ ]:
train_perf
In [ ]:
best_params = train_perf.groupby("split_year").idxmax()
best_params = train_perf[best_params].index
train_perf[best_params]
In [ ]:
best_fast_windows = best_params.get_level_values("fast_window")
best_slow_windows = best_params.get_level_values("slow_window")

test_perf = splitter.apply(
    sma_crossover_perf,
    vbt.Takeable(data),
    vbt.RepFunc(lambda split_idx: best_fast_windows[split_idx]),
    vbt.RepFunc(lambda split_idx: best_slow_windows[split_idx]),
    set_="test",
    merge_func="concat"
)
test_perf

Iteration schemes

In [ ]:
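# With iteration="set_major", every split's train set (set_idx == 0)
# is processed before any test set, so train_perf_list is fully
# populated by the time the test branch runs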
def cv_sma_crossover(
    data, 
    fast_windows, 
    slow_windows, 
    split_idx,
    set_idx,
    train_perf_list
):
    if set_idx == 0:
        train_perf = sma_crossover_perf(
            data,
            vbt.Param(fast_windows, condition="x < slow_window"),
            vbt.Param(slow_windows),
            _execute_kwargs=dict(
                clear_cache=50,
                collect_garbage=50
            )
        )
        train_perf_list.append(train_perf)
        best_params = train_perf.idxmax()
        return train_perf[[best_params]]
    else:
        train_perf = train_perf_list[split_idx]
        best_params = train_perf.idxmax()
        test_perf = sma_crossover_perf(
            data,
            vbt.Param([best_params[0]]),
            vbt.Param([best_params[1]]),
        )
        return test_perf
    
train_perf_list = []
cv_perf = splitter.apply(
    cv_sma_crossover,
    vbt.Takeable(data),
    np.arange(5, 50),
    np.arange(5, 50),
    vbt.Rep("split_idx"),
    vbt.Rep("set_idx"),
    train_perf_list,
    iteration="set_major",
    merge_func="concat",
)
In [ ]:
train_perf = pd.concat(train_perf_list, keys=splitter.split_labels)
train_perf
In [ ]:
cv_perf

Merging

In [ ]:
def get_entries_and_exits(data, fast_window, slow_window):
    fast_sma = data.run("sma", fast_window, short_name="fast_sma")
    slow_sma = data.run("sma", slow_window, short_name="slow_sma")
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    return entries, exits

entries, exits = splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20,
    30,
    merge_func="column_stack"
)

print(entries)
In [ ]:
entries, exits = splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20,
    30,
    merge_all=False,
    merge_func="row_stack"
)

entries.loc[2018]
In [ ]:
def get_signal_count(*args, **kwargs):
    entries, exits = get_entries_and_exits(*args, **kwargs)
    return entries.vbt.signals.total(), exits.vbt.signals.total()

entry_count, exit_count = splitter.apply(
    get_signal_count,
    vbt.Takeable(data),
    20,
    30,
    merge_func="concat",
    attach_bounds="index"
)
entry_count
In [ ]:
def plot_entries_and_exits(results, data, keys):
    set_labels = keys.get_level_values("set")
    fig = data.plot(plot_volume=False)
    train_seen = False
    test_seen = False

    for i in range(len(results)):
        entries, exits = results[i]
        set_label = set_labels[i]
        if set_label == "train":
            entries.vbt.signals.plot_as_entries(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="limegreen"), 
                    name=f"Entries ({set_label})",
                    legendgroup=f"Entries ({set_label})",
                    showlegend=not train_seen
                ),
                fig=fig
            )
            exits.vbt.signals.plot_as_exits(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="orange"), 
                    name=f"Exits ({set_label})",
                    legendgroup=f"Exits ({set_label})",
                    showlegend=not train_seen
                ),
                fig=fig
            )
            train_seen = True
        else:
            entries.vbt.signals.plot_as_entries(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="skyblue"), 
                    name=f"Entries ({set_label})",
                    legendgroup=f"Entries ({set_label})",
                    showlegend=not test_seen
                ),
                fig=fig
            )
            exits.vbt.signals.plot_as_exits(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="magenta"), 
                    name=f"Exits ({set_label})",
                    legendgroup=f"Entries ({set_label})",
                    showlegend=not test_seen
                ),
                fig=fig
            )
            test_seen = True
    return fig

splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20,
    30,
    merge_func=plot_entries_and_exits,
    merge_kwargs=dict(data=data, keys=vbt.Rep("keys")),
).show_svg()

Decorators

In [ ]:
@vbt.split(splitter=splitter)
def get_split_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return(vbt.Takeable(data))
In [ ]:
def get_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return = vbt.split(
    get_total_return, 
    splitter=splitter
)
get_split_total_return(vbt.Takeable(data))
In [ ]:
@vbt.split
def get_split_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return(vbt.Takeable(data), _splitter=splitter)
In [ ]:
get_split_total_return(
    vbt.Takeable(data.loc["2020":"2020"]), 
    _splitter="from_rolling", 
    _splitter_kwargs=dict(length="30d")
)
In [ ]:
get_total_return_by_month = vbt.split(
    get_total_return,
    splitter="from_grouper", 
    splitter_kwargs=dict(by=vbt.RepEval("index.to_period('M')")),
    takeable_args=["data"]
)

get_total_return_by_month(data.loc["2020":"2020"])
In [ ]:
cv_sma_crossover_perf = vbt.split(
    sma_crossover_perf, 
    splitter="from_single",
    splitter_kwargs=dict(split=0.6, set_labels=["train", "test"]),
    takeable_args=["data"],
    merge_func="concat",
)
In [ ]:
train_perf = cv_sma_crossover_perf(
    data.loc["2020":"2021"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    p_execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    ),
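    # _forward_kwargs_as renames p_execute_kwargs to _execute_kwargs
    # when forwarding, so the argument reaches the inner parameterized
    # function instead of being consumed by the splitting wrapper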
    _forward_kwargs_as={
        "p_execute_kwargs": "_execute_kwargs"
    },
    _apply_kwargs=dict(set_="train")
)
In [ ]:
train_perf
In [ ]:
test_perf = cv_sma_crossover_perf(
    data.loc["2020":"2021"],
    train_perf.idxmax()[0],
    train_perf.idxmax()[1],
    _apply_kwargs=dict(set_="test")
)
In [ ]:
test_perf
In [ ]:
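# Numba-compiled version of the whole pipeline: moving averages,
# crossover signals, simulation, and Sharpe are computed with vbt's
# low-level nb functions, so runs can be parallelized (nogil=True)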
@njit(nogil=True)
def sma_crossover_perf_nb(close, fast_window, slow_window, ann_factor):
    fast_sma = vbt.nb.ma_nb(close, fast_window)
    slow_sma = vbt.nb.ma_nb(close, slow_window)
    entries = vbt.nb.crossed_above_nb(fast_sma, slow_sma)
    exits = vbt.nb.crossed_above_nb(slow_sma, fast_sma)
    sim_out = vbt.pf_nb.from_signals_nb(
        target_shape=close.shape,
        group_lens=np.full(close.shape[1], 1),
        close=close,
        long_entries=entries,
        short_entries=exits,
        save_returns=True
    )
    return vbt.ret_nb.sharpe_ratio_nb(
        sim_out.in_outputs.returns, 
        ann_factor
    )
In [ ]:
sma_crossover_perf_nb(vbt.to_2d_array(data.close), 20, 30, 365)
In [ ]:
cv_sma_crossover_perf = vbt.cv_split(
    sma_crossover_perf_nb,
    splitter="from_rolling",
    splitter_kwargs=dict(
        length=360, 
        split=0.5, 
        set_labels=["train", "test"]
    ),
    takeable_args=["close"],
    merge_func="concat",
    parameterized_kwargs=dict(
        engine="dask", 
        chunk_len="auto",
    )
)

grid_perf, best_perf = cv_sma_crossover_perf(
    vbt.to_2d_array(data.close),
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
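    # ann_factor: number of periods per year (365 for daily data)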
    pd.Timedelta(days=365) // data.index.freq,
    _merge_kwargs=dict(wrapper=data.symbol_wrapper),
    _index=data.index,
    _return_grid="all"
)
In [ ]:
grid_perf
In [ ]:
best_perf
In [ ]:
best_train_perf = best_perf.xs("train", level="set")
best_test_perf = best_perf.xs("test", level="set")
best_train_perf.corr(best_test_perf)
In [ ]:
param_cross_set_corr = grid_perf\
    .unstack("set")\
    .groupby(["fast_window", "slow_window"])\
    .apply(lambda x: x["train"].corr(x["test"]))
param_cross_set_corr.vbt.heatmap(symmetric=True).show_svg()
In [ ]:
grid_test_perf = grid_perf.xs("test", level="set")
grid_df = grid_test_perf.rename("grid").reset_index()
del grid_df["fast_window"]
del grid_df["slow_window"]
best_df = best_test_perf.rename("best").reset_index()
del best_df["fast_window"]
del best_df["slow_window"]
merged_df = pd.merge(grid_df, best_df, on=["split", "symbol"])
grid_better_mask = merged_df["grid"] > merged_df["best"]
grid_better_mask.index = grid_test_perf.index
grid_better_cnt = grid_better_mask.groupby(["split", "symbol"]).mean()
grid_better_cnt
In [ ]:
cv_splitter = cv_sma_crossover_perf(
    _index=data.index, 
    _return_splitter=True
)
stacked_close = cv_splitter.take(
    data.close, 
    into="reset_stacked",
    set_="test"
)
hold_pf = vbt.Portfolio.from_holding(stacked_close, freq="daily")
hold_perf = hold_pf.sharpe_ratio
hold_perf

Modeling

In [ ]:
X = data.run("talib")
X.shape
In [ ]:
trendlb = data.run("trendlb", 1.0, 0.5, mode="binary")
y = trendlb.labels
y.shape
In [ ]:
X = X.replace([-np.inf, np.inf], np.nan)
invalid_column_mask = X.isnull().all(axis=0) | (X.nunique() == 1)
X = X.loc[:, ~invalid_column_mask]
invalid_row_mask = X.isnull().any(axis=1) | y.isnull()
X = X.loc[~invalid_row_mask]
y = y.loc[~invalid_row_mask]
X.shape, y.shape
In [ ]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
In [ ]:
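# SplitterCV wraps a vbt splitter in the scikit-learn cross-validator
# interface, so it can be passed anywhere a KFold-like object is used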
cv = vbt.SplitterCV(
    "from_expanding", 
    min_length=360, 
    offset=180, 
    split=-180,
    set_labels=["train", "test"]
)

cv_splitter = cv.get_splitter(X)
cv_splitter.plot().show_svg()
In [ ]:
from sklearn.model_selection import cross_val_score

cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
In [ ]:
X_slices = cv_splitter.take(X)
y_slices = cv_splitter.take(y)
In [ ]:
test_labels = []
test_preds = []
for split in X_slices.index.unique(level="split"):
    X_train_slice = X_slices[(split, "train")]
    y_train_slice = y_slices[(split, "train")]
    X_test_slice = X_slices[(split, "test")]
    y_test_slice = y_slices[(split, "test")]
    slice_clf = clf.fit(X_train_slice, y_train_slice)
    test_pred = slice_clf.predict(X_test_slice)
    test_pred = pd.Series(test_pred, index=y_test_slice.index)
    test_labels.append(y_test_slice)
    test_preds.append(test_pred)
    
test_labels = pd.concat(test_labels).rename("labels")
test_preds = pd.concat(test_preds).rename("preds")
In [ ]:
data.close.vbt.overlay_with_heatmap(test_labels).show_svg()
In [ ]:
data.close.vbt.overlay_with_heatmap(test_preds).show_svg()
In [ ]:
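# Trade the out-of-sample predictions: go long when an uptrend (1) is
# predicted and short when a downtrend (0) is predicted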
pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index], 
    test_preds == 1, 
    test_preds == 0, 
    direction="both"
)
pf.stats()