Cross-validation
In [ ]:
from vectorbtpro import *

# whats_imported()
vbt.settings.set_theme("dark")
In [ ]:
data = vbt.BinanceData.pull("BTCUSDT", end="2022-11-01 UTC")
data.index
In [ ]:
@vbt.parameterized(merge_func="concat")
def sma_crossover_perf(data, fast_window, slow_window):
    fast_sma = data.run("sma", fast_window, short_name="fast_sma")
    slow_sma = data.run("sma", slow_window, short_name="slow_sma")
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    pf = vbt.Portfolio.from_signals(
        data, entries, exits, direction="both")
    return pf.sharpe_ratio
In [ ]:
perf = sma_crossover_perf(
    data["2020":"2020"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    )
)
perf
In [ ]:
perf.sort_values(ascending=False)
In [ ]:
best_fast_window, best_slow_window = perf.idxmax()
sma_crossover_perf(
    data["2021":"2021"],
    best_fast_window,
    best_slow_window
)
In [ ]:
data["2021":"2021"].run("from_holding").sharpe_ratio
In [ ]:
start_index = data.index[0]
period = pd.Timedelta(days=180)
all_is_bounds = {}
all_is_bl_perf = {}
all_is_perf = {}
all_oos_bounds = {}
all_oos_bl_perf = {}
all_oos_perf = {}
split_idx = 0
period_idx = 0

with vbt.ProgressBar() as pbar:
    while start_index + 2 * period <= data.index[-1]:
        pbar.set_prefix(str(start_index))

        is_start_index = start_index
        is_end_index = start_index + period - pd.Timedelta(nanoseconds=1)
        is_data = data[is_start_index : is_end_index]
        is_bl_perf = is_data.run("from_holding").sharpe_ratio
        is_perf = sma_crossover_perf(
            is_data,
            vbt.Param(np.arange(5, 50), condition="x < slow_window"),
            vbt.Param(np.arange(5, 50)),
            _execute_kwargs=dict(
                clear_cache=50,
                collect_garbage=50
            )
        )

        oos_start_index = start_index + period
        oos_end_index = start_index + 2 * period - pd.Timedelta(nanoseconds=1)
        oos_data = data[oos_start_index : oos_end_index]
        oos_bl_perf = oos_data.run("from_holding").sharpe_ratio
        best_fw, best_sw = is_perf.idxmax()
        oos_perf = sma_crossover_perf(oos_data, best_fw, best_sw)
        oos_perf_index = is_perf.index[is_perf.index == (best_fw, best_sw)]
        oos_perf = pd.Series([oos_perf], index=oos_perf_index)

        all_is_bounds[period_idx] = (is_start_index, is_end_index)
        all_oos_bounds[period_idx + 1] = (oos_start_index, oos_end_index)
        all_is_bl_perf[(split_idx, period_idx)] = is_bl_perf
        all_oos_bl_perf[(split_idx, period_idx + 1)] = oos_bl_perf
        all_is_perf[(split_idx, period_idx)] = is_perf
        all_oos_perf[(split_idx, period_idx + 1)] = oos_perf

        start_index = start_index + period
        split_idx += 1
        period_idx += 1
        pbar.update()
In [ ]:
is_period_ranges = pd.DataFrame.from_dict(
    all_is_bounds, orient="index", columns=["start", "end"]
)
is_period_ranges.index.name = "period"
oos_period_ranges = pd.DataFrame.from_dict(
    all_oos_bounds, orient="index", columns=["start", "end"]
)
oos_period_ranges.index.name = "period"
period_ranges = pd.concat((is_period_ranges, oos_period_ranges))
period_ranges = period_ranges.drop_duplicates()
print(period_ranges)
In [ ]:
is_bl_perf = pd.Series(all_is_bl_perf)
is_bl_perf.index.names = ["split", "period"]
oos_bl_perf = pd.Series(all_oos_bl_perf)
oos_bl_perf.index.names = ["split", "period"]
bl_perf = pd.concat((
    is_bl_perf.vbt.select_levels("period"),
    oos_bl_perf.vbt.select_levels("period")
))
bl_perf = bl_perf.drop_duplicates()
bl_perf
In [ ]:
is_perf = pd.concat(all_is_perf, names=["split", "period"])
is_perf
In [ ]:
oos_perf = pd.concat(all_oos_perf, names=["split", "period"])
oos_perf
In [ ]:
is_best_mask = is_perf.index.vbt.drop_levels("period").isin(
    oos_perf.index.vbt.drop_levels("period"))
is_best_perf = is_perf[is_best_mask]
is_best_perf
In [ ]:
print(pd.concat((
    is_perf.describe(),
    is_best_perf.describe(),
    is_bl_perf.describe(),
    oos_perf.describe(),
    oos_bl_perf.describe()
), axis=1, keys=[
    "IS",
    "IS (Best)",
    "IS (Baseline)",
    "OOS (Test)",
    "OOS (Baseline)"
]))
In [ ]:
fig = is_perf.vbt.boxplot(
    by_level="period",
    trace_kwargs=dict(
        line=dict(color="lightskyblue"),
        opacity=0.4,
        showlegend=False
    ),
    xaxis_title="Period",
    yaxis_title="Sharpe",
)
fig = is_best_perf.vbt.select_levels("period").vbt.plot(
    trace_kwargs=dict(
        name="Best",
        line=dict(color="limegreen", dash="dash")
    ),
    fig=fig
)
fig = bl_perf.vbt.plot(
    trace_kwargs=dict(
        name="Baseline",
        line=dict(color="orange", dash="dash")
    ),
    fig=fig
)
fig = oos_perf.vbt.select_levels("period").vbt.plot(
    trace_kwargs=dict(
        name="Test",
        line=dict(color="orangered")
    ),
    fig=fig
)
fig.show_svg()
In [ ]:
is_perf_split6 = is_perf.xs(6, level="split")
is_perf_split6.describe()
In [ ]:
first_left_bound = period_ranges.loc[6, "start"]
first_right_bound = period_ranges.loc[6, "end"]
data[first_left_bound : first_right_bound].plot().show_svg()
In [ ]:
oos_perf.xs(6, level="period")
In [ ]:
is_perf_split6.quantile(0.25)
Splitter
In [ ]:
splitter = vbt.Splitter.from_rolling(
    data.index,
    length=360,
    split=0.5,
    set_labels=["IS", "OOS"]
)
splitter.plot().show_svg()
Schema
In [ ]:
print(splitter.splits)
In [ ]:
splitter.index
In [ ]:
splitter.wrapper.index
In [ ]:
splitter.wrapper.columns
In [ ]:
oos_splitter = splitter["OOS"]
print(oos_splitter.splits)
Range format
In [ ]:
index = vbt.date_range("2020", periods=14)
index[slice(1, 7)]
In [ ]:
index[1], index[6]
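A small added check of the slice semantics shown above: the stop bound is exclusive, so slice(1, 7) covers index[1] through index[6].
In [ ]:
# Added sanity check: a slice's stop bound is exclusive,
# so slice(1, 7) selects index[1] through index[6]
assert index[slice(1, 7)][0] == index[1]
assert index[slice(1, 7)][-1] == index[6]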
Relative
In [ ]:
rel_range = vbt.RelRange(offset=10, length=40)
rel_range
In [ ]:
rel_range.to_slice(total_len=len(splitter.index), prev_end=100)
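A minimal sketch of the arithmetic behind this result, assuming the offset is anchored at the end of the previous range (the default): start = prev_end + offset and stop = start + length.
In [ ]:
# Minimal sketch (added), assuming the offset is anchored at prev_end:
# start = prev_end + offset, stop = start + length
prev_end, offset, length = 100, 10, 40
slice(prev_end + offset, prev_end + offset + length)  # slice(110, 150, None)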
Array format
In [ ]:
index = vbt.date_range("2020", "2021", freq="1min")
range_ = np.arange(len(index))
range_.nbytes / 1024 / 1024
In [ ]:
range_ = np.full(len(index), True)
range_.nbytes / 1024 / 1024
In [ ]:
splitter.splits_arr.dtype
In [ ]:
id(slice(0, 180, None))
In [ ]:
range_00 = np.arange(0, 5)
range_01 = np.arange(5, 15)
range_10 = np.arange(15, 30)
range_11 = np.arange(30, 50)
ind_splitter = vbt.Splitter.from_splits(
    data.index,
    [[range_00, range_01], [range_10, range_11]],
    fix_ranges=False
)
print(ind_splitter.splits)
In [ ]:
ind_splitter.splits.loc[0, "set_1"]
In [ ]:
ind_splitter.splits.loc[0, "set_1"].range_
Preparation
Splits
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange()),
    index=data.index
)
In [ ]:
splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange())
)
In [ ]:
data[slice(0, 1426, None)]
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    0.75,
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    -0.25,
    index=data.index
)
In [ ]:
int(0.75 * len(data.index))
In [ ]:
len(data.index) - int(0.25 * len(data.index))
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (vbt.RelRange(), vbt.RelRange(length=0.25)),
    backwards=True,
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (1.0, 30),
    backwards=True,
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (
        vbt.RelRange(length=0.4, length_space="all"),
        vbt.RelRange(length=0.4, length_space="all"),
        vbt.RelRange()
    ),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (vbt.RelRange(length=0.75), vbt.RelRange(offset=1)),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (
        vbt.RelRange(length=0.75),
        vbt.RelRange(length=1, is_gap=True),
        vbt.RelRange()
    ),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (np.array([3, 4, 5]), np.array([6, 8, 10])),
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (np.array([3, 4, 5]), np.array([6, 8, 10])),
    range_format="indices",
    index=data.index
)
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (slice("2020", "2021"), slice("2021", "2022")),
    index=data.index
)
In [ ]:
data.index[867:1233]
In [ ]:
data.index[1233:1598]
In [ ]:
vbt.Splitter.split_range(
    slice(None),
    (
        vbt.RelRange(length="180 days"),
        vbt.RelRange(offset="1 day", length="90 days")
    ),
    index=data.index
)
Method
In [ ]:
manual_splitter = vbt.Splitter.from_splits(
    data.index,
    [
        (vbt.RelRange(), vbt.RelRange(offset=0.5, length=0.25, length_space="all")),
        (vbt.RelRange(), vbt.RelRange(offset=0.25, length=0.25, length_space="all")),
        (vbt.RelRange(), vbt.RelRange(offset=0, length=0.25, length_space="all")),
    ],
    split_range_kwargs=dict(backwards=True),
    set_labels=["IS", "OOS"]
)
print(manual_splitter.splits)
In [ ]:
manual_splitter.plot().show_svg()
Generation
Rolling
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
    offset=90
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
    offset=-0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_rolling(
    data.index,
    length=360,
    split=0.5,
    offset_anchor_set=None
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=5,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=3,
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_rolling(
    data.index,
    n=7,
    length=360,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_expanding(
    data.index,
    min_length=360,
    offset=180,
    split=-180
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_expanding(
    data.index,
    n=5,
    min_length=360,
    split=-180
).plot().show_svg()
Anchored
In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Y",
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Q",
    lookback_period="Y",
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_ranges(
    data.index,
    every="Q",
    lookback_period="Y",
    split=(
        vbt.RepEval("index.month != index.month[-1]"),
        vbt.RepEval("index.month == index.month[-1]")
    )
).plot().show_svg()
In [ ]:
def qyear(index):
    return index.to_period("Q")

vbt.Splitter.from_ranges(
    data.index,
    start=0,
    fixed_start=True,
    every="Q",
    closed_end=True,
    split=(
        lambda index: qyear(index) != qyear(index)[-1],
        lambda index: qyear(index) == qyear(index)[-1]
    )
).plot().show_svg()
In [ ]:
vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5
).plot().show_svg()
In [ ]:
def is_split_complete(index, split):
    first_range = split[0]
    first_index = index[first_range][0]
    last_range = split[-1]
    last_index = index[last_range][-1]
    return first_index.is_year_start and last_index.is_year_end

vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete)
).plot().show_svg()
In [ ]:
def format_split_labels(index, splits_arr):
    years = map(lambda x: index[x[0]][0].year, splits_arr)
    return pd.Index(years, name="split_year")

vbt.Splitter.from_grouper(
    data.index,
    by="Y",
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete),
    split_labels=vbt.RepFunc(format_split_labels)
).plot().show_svg()
In [ ]:
vbt.Splitter.from_grouper(
    data.index,
    by=data.index.year,
    split=0.5,
    split_check_template=vbt.RepFunc(is_split_complete)
).plot().show_svg()
Random
In [ ]:
vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=360,
    seed=42,
    split=0.5
).plot().show_svg()
In [ ]:
vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=60,
    max_length=480,
    seed=42,
    split=0.5
).plot().show_svg()
In [ ]:
def start_p_func(i, indices):
    return indices / indices.sum()

vbt.Splitter.from_n_random(
    data.index,
    n=50,
    min_length=60,
    max_length=480,
    seed=42,
    start_p_func=start_p_func,
    split=0.5
).plot().show_svg()
Scikit-learn
In [ ]:
from sklearn.model_selection import KFold

vbt.Splitter.from_sklearn(
    data.index,
    KFold(n_splits=5)
).plot().show_svg()
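Any scikit-learn-compatible cross-validator should be accepted the same way. As an added example, sklearn's TimeSeriesSplit keeps training samples strictly before test samples, which is usually more appropriate for time-ordered data than KFold:
In [ ]:
# Added example: a time-ordered sklearn cross-validator
# should also work with from_sklearn
from sklearn.model_selection import TimeSeriesSplit

vbt.Splitter.from_sklearn(
    data.index,
    TimeSeriesSplit(n_splits=5)
).plot().show_svg()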
Dynamic
In [ ]:
def split_func(index, prev_start):
    if prev_start is None:
        prev_start = index[0]
    new_start = prev_start + pd.offsets.MonthBegin(1)
    new_end = new_start + pd.DateOffset(years=1)
    if new_end > index[-1] + index.freq:
        return None
    return [
        slice(new_start, new_start + pd.offsets.MonthBegin(9)),
        slice(new_start + pd.offsets.MonthBegin(9), new_end)
    ]

vbt.Splitter.from_split_func(
    data.index,
    split_func=split_func,
    split_args=(vbt.Rep("index"), vbt.Rep("prev_start")),
    range_bounds_kwargs=dict(index_bounds=True)
).plot().show_svg()
In [ ]:
def get_next_monday(from_date):
    if from_date.weekday() == 0 and from_date.ceil("H").hour <= 9:
        return from_date.floor("D")
    return from_date.floor("D") + pd.offsets.Week(n=0, weekday=0)

def get_next_business_range(from_date):
    monday_0000 = get_next_monday(from_date)
    monday_0900 = monday_0000 + pd.DateOffset(hours=9)
    friday_1700 = monday_0900 + pd.DateOffset(days=4, hours=8)
    return slice(monday_0900, friday_1700)

def split_func(index, bounds):
    if len(bounds) == 0:
        from_date = index[0]
    else:
        from_date = bounds[-1][1][0]
    train_range = get_next_business_range(from_date)
    test_range = get_next_business_range(train_range.stop)
    if test_range.stop > index[-1] + index.freq:
        return None
    return train_range, test_range

vbt.Splitter.from_split_func(
    vbt.date_range("2020-01", "2020-03", freq="15min"),
    split_func=split_func,
    split_args=(vbt.Rep("index"), vbt.Rep("bounds")),
    range_bounds_kwargs=dict(index_bounds=True)
).plot().show_svg()
Validation
In [ ]:
splitter = vbt.Splitter.from_ranges(
    data.index,
    every="Y",
    closed_end=True,
    split=0.5,
    set_labels=["IS", "OOS"]
)
splitter.plot().show_svg()
Bounds
In [ ]:
bounds_arr = splitter.get_bounds_arr()
bounds_arr.shape
In [ ]:
print(bounds_arr)
In [ ]:
bounds = splitter.get_bounds(index_bounds=True)
bounds.shape
In [ ]:
print(bounds)
In [ ]:
bounds.loc[(0, "OOS"), "end"]
In [ ]:
bounds.loc[(1, "IS"), "start"]
Masks
In [ ]:
mask = splitter.get_mask()
mask.shape
In [ ]:
print(mask)
In [ ]:
mask["2021":"2021"].any()
In [ ]:
print(mask.resample(vbt.offset("Y")).sum())
In [ ]:
results = []
for mask in splitter.get_iter_split_masks():
    results.append(mask.resample(vbt.offset("Y")).sum())
print(pd.concat(results, axis=1, keys=splitter.split_labels))
Coverage
In [ ]:
splitter.get_split_coverage()
In [ ]:
splitter.get_set_coverage()
In [ ]:
splitter.get_range_coverage()
In [ ]:
splitter.get_coverage()
In [ ]:
splitter.index_bounds.loc[(2, "OOS"), "start"].is_leap_year
In [ ]:
splitter.get_range_coverage(relative=True)
In [ ]:
splitter.get_set_coverage(relative=True)
In [ ]:
splitter.get_split_coverage(overlapping=True)
In [ ]:
splitter.get_set_coverage(overlapping=True)
In [ ]:
splitter.get_coverage(overlapping=True)
In [ ]:
splitter.plot_coverage().show_svg()
In [ ]:
print(splitter.get_overlap_matrix(by="range", normalize=False))
Grouping
In [ ]:
print(splitter.get_bounds(index_bounds=True, set_group_by=True))
Manipulation
In [ ]:
splitter = vbt.Splitter.from_grouper(
    data.index,
    by=data.index.year.rename("split_year")
)
In [ ]:
splitter.stats()
In [ ]:
splitter.plots().show_svg()
In [ ]:
splitter = splitter.iloc[1:-1]
splitter.stats()
In [ ]:
def new_split(index):
    return [
        np.isin(index.quarter, [1, 2]),
        index.quarter == 3,
        index.quarter == 4
    ]

splitter = splitter.split_set(
    vbt.RepFunc(new_split),
    new_set_labels=["train", "valid", "test"]
)
In [ ]:
splitter.stats()
In [ ]:
splitter.plots().show_svg()
Homework
In [ ]:
splitter = splitter.merge_sets(columns=["valid", "test"], new_set_label="test")
splitter.plots().show_svg()
Applications
Taking
Without stacking
In [ ]:
close_slices = splitter.take(data.close)
close_slices
In [ ]:
close_slices[2020, "test"]
In [ ]:
def get_total_return(sr):
    return sr.vbt.to_returns().vbt.returns.total()

close_slices.apply(get_total_return)
Complex objects
In [ ]:
trendlb = data.run("trendlb", 1.0, 0.5)
trendlb.plot().show_svg()
In [ ]:
grouper = pd.Index(trendlb.labels.map({1: "U", 0: "D"}), name="trend")
trend_splitter = vbt.Splitter.from_grouper(data.index, grouper)
trend_splitter.plot().show_svg()
In [ ]:
hold_pf = vbt.Portfolio.from_holding(data)
hold_returns_acc = hold_pf.returns_acc

fast_sma, slow_sma = vbt.talib("SMA").run_combs(
    data.close, np.arange(5, 50),
    short_names=["fast_sma", "slow_sma"])
entries = fast_sma.real_crossed_above(slow_sma)
exits = fast_sma.real_crossed_below(slow_sma)
strat_pf = vbt.Portfolio.from_signals(
    data, entries, exits, direction="both")
strat_returns_acc = strat_pf.returns_acc
In [ ]:
hold_returns_acc_slices = trend_splitter.take(hold_returns_acc)
strat_returns_acc_slices = trend_splitter.take(strat_returns_acc)
In [ ]:
hold_returns_acc_slices["U"].sharpe_ratio()
In [ ]:
strat_returns_acc_slices["U"].sharpe_ratio().vbt.heatmap(
    x_level="fast_sma_timeperiod",
    y_level="slow_sma_timeperiod",
    symmetric=True
).show_svg()
In [ ]:
hold_returns_acc_slices["D"].sharpe_ratio()
In [ ]:
strat_returns_acc_slices["D"].sharpe_ratio().vbt.heatmap(
    x_level="fast_sma_timeperiod",
    y_level="slow_sma_timeperiod",
    symmetric=True
).show_svg()
In [ ]:
trend_splitter = trend_splitter.break_up_splits("by_gap", sort=True)
trend_splitter.plot().show_svg()
In [ ]:
strat_pf_slices = strat_pf.split(trend_splitter)
strat_pf_slices
In [ ]:
trend_range_perf = strat_pf_slices.apply(lambda pf: pf.sharpe_ratio)
median_trend_perf = trend_range_perf.median(axis=1)
median_trend_perf
In [ ]:
trend_perf_ts = data.symbol_wrapper.fill().rename("trend_perf")
for label, sr in trend_splitter.bounds.iterrows():
    trend_perf_ts.iloc[sr["start"]:sr["end"]] = median_trend_perf[label]
data.close.vbt.overlay_with_heatmap(trend_perf_ts).show_svg()
Column stacking
In [ ]:
close_stacked = pd.concat(
    close_slices.values.tolist(),
    axis=1,
    keys=close_slices.index
)
print(close_stacked)
In [ ]:
get_total_return(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="stacked")
close_stacked.shape
In [ ]:
close_stacked = splitter.take(data.close, into="reset_stacked")
print(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="from_end_stacked")
print(close_stacked)
In [ ]:
close_stacked = splitter.take(data.close, into="reset_stacked_by_set")
close_stacked
In [ ]:
print(close_stacked["train"])
In [ ]:
print(182 * 8)
print(1461 * 8)
print(1 - 1456 / 11688)
In [ ]:
index_slices = splitter.take(data.index)
index_slices
In [ ]:
close_stacked_wb = splitter.take(
    data.close,
    into="reset_stacked_by_set",
    attach_bounds="index",
    right_inclusive=True
)
print(close_stacked_wb["train"])
In [ ]:
@vbt.parameterized(merge_func="concat")
def set_sma_crossover_perf(close, fast_window, slow_window, freq):
    fast_sma = vbt.talib("sma").run(
        close, fast_window, short_name="fast_sma", hide_params=True)
    slow_sma = vbt.talib("sma").run(
        close, slow_window, short_name="slow_sma", hide_params=True)
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    pf = vbt.Portfolio.from_signals(
        close, entries, exits, freq=freq, direction="both")
    return pf.sharpe_ratio
In [ ]:
train_perf = set_sma_crossover_perf(
    close_stacked["train"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    data.index.freq,
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    )
)
In [ ]:
train_perf
In [ ]:
train_perf.vbt.heatmap(
    x_level="fast_window",
    y_level="slow_window",
    slider_level="split_year",
    symmetric=True
).show_svg()  # replace with show()
In [ ]:
@njit
def prox_median_nb(arr):
    if (~np.isnan(arr)).sum() < 20:
        return np.nan
    return np.nanmedian(arr)

prox_perf_list = []
for split_label, perf_sr in train_perf.groupby("split_year"):
    perf_df = perf_sr.vbt.unstack_to_df(0, [1, 2])
    prox_perf_df = perf_df.vbt.proximity_apply(2, prox_median_nb)
    prox_perf_sr = prox_perf_df.stack([0, 1])
    prox_perf_list.append(prox_perf_sr.reindex(perf_sr.index))
train_prox_perf = pd.concat(prox_perf_list)
train_prox_perf
In [ ]:
train_prox_perf.vbt.heatmap(
    x_level="fast_window",
    y_level="slow_window",
    slider_level="split_year",
    symmetric=True
).show_svg()  # replace with show()
In [ ]:
best_params = train_prox_perf.groupby("split_year").idxmax()
best_params = train_prox_perf[best_params].index
train_prox_perf[best_params]
In [ ]:
test_perf = set_sma_crossover_perf(
    vbt.RepEval(
        "test_close.iloc[:, [config_idx]]",
        context=dict(test_close=close_stacked["test"])
    ),
    vbt.Param(best_params.get_level_values("fast_window"), level=0),
    vbt.Param(best_params.get_level_values("slow_window"), level=0),
    data.index.freq
)
test_perf
In [ ]:
def get_index_sharpe(index):
    return data.loc[index].run("from_holding").sharpe_ratio

index_slices.xs("test", level="set").apply(get_index_sharpe)
Row stacking
In [ ]:
block_size = int(3.15 * len(data.index) ** (1 / 3))
block_splitter = vbt.Splitter.from_rolling(
    data.index,
    length=block_size,
    offset=1,
    offset_anchor="prev_start"
)
block_splitter.n_splits
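The block length above follows the cube-root scaling commonly used for block-bootstrap block sizes. A worked instance of the same arithmetic, with a hypothetical sample size for illustration:
In [ ]:
# Added worked example of the rule of thumb (N is hypothetical):
# block_size = int(3.15 * N ** (1 / 3))
N = 1900  # e.g., roughly five years of daily bars
int(3.15 * N ** (1 / 3))  # -> 39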
In [ ]:
size = int(block_splitter.n_splits / block_size)
sample_splitter = block_splitter.shuffle_splits(size=size, replace=True)
sample_splitter.plot().show_svg()
In [ ]:
returns = data.returns
sample_rets = sample_splitter.take(returns, into="stacked", stack_axis=0)
sample_rets
In [ ]:
sample_rets.index = data.index[:len(sample_rets)]
sample_cumrets = data.close[0] * (sample_rets + 1).cumprod()
sample_cumrets.vbt.plot().show_svg()
In [ ]:
samples_rets_list = []
for i in vbt.ProgressBar(range(1000)):
    sample_spl = block_splitter.shuffle_splits(size=size, replace=True)
    sample_rets = sample_spl.take(returns, into="stacked", stack_axis=0)
    sample_rets.index = returns.index[:len(sample_rets)]
    sample_rets.name = i
    samples_rets_list.append(sample_rets)
sample_rets_stacked = pd.concat(samples_rets_list, axis=1)
In [ ]:
sample_sharpe = sample_rets_stacked.vbt.returns.sharpe_ratio()
sample_sharpe.vbt.boxplot(horizontal=True).show_svg()
In [ ]:
sample_sharpe.quantile(0.025), sample_sharpe.quantile(0.975)
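As an added sanity check, the full-sample Sharpe ratio should typically fall inside this bootstrapped 95% interval:
In [ ]:
# Added check: compare the bootstrapped interval against the
# Sharpe ratio of the full, unresampled return series
lower = sample_sharpe.quantile(0.025)
upper = sample_sharpe.quantile(0.975)
full_sharpe = returns.vbt.returns.sharpe_ratio()
lower <= full_sharpe <= upper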
Applying
In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data.close),
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.RepFunc(lambda range_: data.close[range_]),
    merge_func="concat"
)
In [ ]:
def get_total_return(range_, data):
    return data.returns[range_].vbt.returns.total()

splitter.apply(
    get_total_return,
    vbt.Rep("range_"),
    data,
    merge_func="concat"
)
In [ ]:
def get_total_return(data):
    return data.returns.vbt.returns.total()

splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    set_group_by=True,
    merge_func="concat"
)
In [ ]:
splitter.apply(
    get_total_return,
    vbt.Takeable(data),
    split=[2020, 2021],
    set_="train",
    merge_func="concat"
)
In [ ]:
train_perf = splitter.apply(
    sma_crossover_perf,
    vbt.Takeable(data),
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    _execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    ),
    set_="train",
    merge_func="concat",
)
In [ ]:
train_perf
In [ ]:
best_params = train_perf.groupby("split_year").idxmax()
best_params = train_perf[best_params].index
train_perf[best_params]
In [ ]:
best_fast_windows = best_params.get_level_values("fast_window")
best_slow_windows = best_params.get_level_values("slow_window")
test_perf = splitter.apply(
    sma_crossover_perf,
    vbt.Takeable(data),
    vbt.RepFunc(lambda split_idx: best_fast_windows[split_idx]),
    vbt.RepFunc(lambda split_idx: best_slow_windows[split_idx]),
    set_="test",
    merge_func="concat"
)
test_perf
Iteration schemes
In [ ]:
def cv_sma_crossover(
    data,
    fast_windows,
    slow_windows,
    split_idx,
    set_idx,
    train_perf_list
):
    if set_idx == 0:
        train_perf = sma_crossover_perf(
            data,
            vbt.Param(fast_windows, condition="x < slow_window"),
            vbt.Param(slow_windows),
            _execute_kwargs=dict(
                clear_cache=50,
                collect_garbage=50
            )
        )
        train_perf_list.append(train_perf)
        best_params = train_perf.idxmax()
        return train_perf[[best_params]]
    else:
        train_perf = train_perf_list[split_idx]
        best_params = train_perf.idxmax()
        test_perf = sma_crossover_perf(
            data,
            vbt.Param([best_params[0]]),
            vbt.Param([best_params[1]]),
        )
        return test_perf

train_perf_list = []
cv_perf = splitter.apply(
    cv_sma_crossover,
    vbt.Takeable(data),
    np.arange(5, 50),
    np.arange(5, 50),
    vbt.Rep("split_idx"),
    vbt.Rep("set_idx"),
    train_perf_list,
    iteration="set_major",
    merge_func="concat",
)
In [ ]:
train_perf = pd.concat(train_perf_list, keys=splitter.split_labels)
train_perf
In [ ]:
cv_perf
Merging
In [ ]:
def get_entries_and_exits(data, fast_window, slow_window):
    fast_sma = data.run("sma", fast_window, short_name="fast_sma")
    slow_sma = data.run("sma", slow_window, short_name="slow_sma")
    entries = fast_sma.real_crossed_above(slow_sma)
    exits = fast_sma.real_crossed_below(slow_sma)
    return entries, exits

entries, exits = splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20, 30,
    merge_func="column_stack"
)
print(entries)
In [ ]:
entries, exits = splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20, 30,
    merge_all=False,
    merge_func="row_stack"
)
entries.loc[2018]
In [ ]:
def get_signal_count(*args, **kwargs):
    entries, exits = get_entries_and_exits(*args, **kwargs)
    return entries.vbt.signals.total(), exits.vbt.signals.total()

entry_count, exit_count = splitter.apply(
    get_signal_count,
    vbt.Takeable(data),
    20, 30,
    merge_func="concat",
    attach_bounds="index"
)
entry_count
In [ ]:
def plot_entries_and_exits(results, data, keys):
    set_labels = keys.get_level_values("set")
    fig = data.plot(plot_volume=False)
    train_seen = False
    test_seen = False
    for i in range(len(results)):
        entries, exits = results[i]
        set_label = set_labels[i]
        if set_label == "train":
            entries.vbt.signals.plot_as_entries(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="limegreen"),
                    name=f"Entries ({set_label})",
                    legendgroup=f"Entries ({set_label})",
                    showlegend=not train_seen
                ),
                fig=fig
            )
            exits.vbt.signals.plot_as_exits(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="orange"),
                    name=f"Exits ({set_label})",
                    legendgroup=f"Exits ({set_label})",
                    showlegend=not train_seen
                ),
                fig=fig
            )
            train_seen = True
        else:
            entries.vbt.signals.plot_as_entries(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="skyblue"),
                    name=f"Entries ({set_label})",
                    legendgroup=f"Entries ({set_label})",
                    showlegend=not test_seen
                ),
                fig=fig
            )
            exits.vbt.signals.plot_as_exits(
                data.close,
                trace_kwargs=dict(
                    marker=dict(color="magenta"),
                    name=f"Exits ({set_label})",
                    legendgroup=f"Exits ({set_label})",
                    showlegend=not test_seen
                ),
                fig=fig
            )
            test_seen = True
    return fig

splitter.apply(
    get_entries_and_exits,
    vbt.Takeable(data),
    20, 30,
    merge_func=plot_entries_and_exits,
    merge_kwargs=dict(data=data, keys=vbt.Rep("keys")),
).show_svg()
Decorators
In [ ]:
@vbt.split(splitter=splitter)
def get_split_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return(vbt.Takeable(data))
In [ ]:
def get_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return = vbt.split(
    get_total_return,
    splitter=splitter
)
get_split_total_return(vbt.Takeable(data))
In [ ]:
@vbt.split
def get_split_total_return(data):
    return data.returns.vbt.returns.total()

get_split_total_return(vbt.Takeable(data), _splitter=splitter)
In [ ]:
get_split_total_return(
    vbt.Takeable(data.loc["2020":"2020"]),
    _splitter="from_rolling",
    _splitter_kwargs=dict(length="30d")
)
In [ ]:
get_total_return_by_month = vbt.split(
    get_total_return,
    splitter="from_grouper",
    splitter_kwargs=dict(by=vbt.RepEval("index.to_period('M')")),
    takeable_args=["data"]
)
get_total_return_by_month(data.loc["2020":"2020"])
In [ ]:
cv_sma_crossover_perf = vbt.split(
    sma_crossover_perf,
    splitter="from_single",
    splitter_kwargs=dict(split=0.6, set_labels=["train", "test"]),
    takeable_args=["data"],
    merge_func="concat",
)
In [ ]:
train_perf = cv_sma_crossover_perf(
    data.loc["2020":"2021"],
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    p_execute_kwargs=dict(
        clear_cache=50,
        collect_garbage=50
    ),
    _forward_kwargs_as={
        "p_execute_kwargs": "_execute_kwargs"
    },
    _apply_kwargs=dict(set_="train")
)
In [ ]:
train_perf
In [ ]:
test_perf = cv_sma_crossover_perf(
    data.loc["2020":"2021"],
    train_perf.idxmax()[0],
    train_perf.idxmax()[1],
    _apply_kwargs=dict(set_="test")
)
In [ ]:
test_perf
In [ ]:
@njit(nogil=True)
def sma_crossover_perf_nb(close, fast_window, slow_window, ann_factor):
    fast_sma = vbt.nb.ma_nb(close, fast_window)
    slow_sma = vbt.nb.ma_nb(close, slow_window)
    entries = vbt.nb.crossed_above_nb(fast_sma, slow_sma)
    exits = vbt.nb.crossed_above_nb(slow_sma, fast_sma)
    sim_out = vbt.pf_nb.from_signals_nb(
        target_shape=close.shape,
        group_lens=np.full(close.shape[1], 1),
        close=close,
        long_entries=entries,
        short_entries=exits,
        save_returns=True
    )
    return vbt.ret_nb.sharpe_ratio_nb(
        sim_out.in_outputs.returns,
        ann_factor
    )
In [ ]:
sma_crossover_perf_nb(vbt.to_2d_array(data.close), 20, 30, 365)
In [ ]:
cv_sma_crossover_perf = vbt.cv_split(
    sma_crossover_perf_nb,
    splitter="from_rolling",
    splitter_kwargs=dict(
        length=360,
        split=0.5,
        set_labels=["train", "test"]
    ),
    takeable_args=["close"],
    merge_func="concat",
    parameterized_kwargs=dict(
        engine="dask",
        chunk_len="auto",
    )
)

grid_perf, best_perf = cv_sma_crossover_perf(
    vbt.to_2d_array(data.close),
    vbt.Param(np.arange(5, 50), condition="x < slow_window"),
    vbt.Param(np.arange(5, 50)),
    pd.Timedelta(days=365) // data.index.freq,
    _merge_kwargs=dict(wrapper=data.symbol_wrapper),
    _index=data.index,
    _return_grid="all"
)
In [ ]:
grid_perf
In [ ]:
best_perf
In [ ]:
best_train_perf = best_perf.xs("train", level="set")
best_test_perf = best_perf.xs("test", level="set")
best_train_perf.corr(best_test_perf)
In [ ]:
param_cross_set_corr = grid_perf\
    .unstack("set")\
    .groupby(["fast_window", "slow_window"])\
    .apply(lambda x: x["train"].corr(x["test"]))
param_cross_set_corr.vbt.heatmap(symmetric=True).show_svg()
In [ ]:
grid_test_perf = grid_perf.xs("test", level="set")
grid_df = grid_test_perf.rename("grid").reset_index()
del grid_df["fast_window"]
del grid_df["slow_window"]
best_df = best_test_perf.rename("best").reset_index()
del best_df["fast_window"]
del best_df["slow_window"]
merged_df = pd.merge(grid_df, best_df, on=["split", "symbol"])
grid_better_mask = merged_df["grid"] > merged_df["best"]
grid_better_mask.index = grid_test_perf.index
grid_better_cnt = grid_better_mask.groupby(["split", "symbol"]).mean()
grid_better_cnt
In [ ]:
cv_splitter = cv_sma_crossover_perf(
    _index=data.index,
    _return_splitter=True
)
stacked_close = cv_splitter.take(
    data.close,
    into="reset_stacked",
    set_="test"
)
hold_pf = vbt.Portfolio.from_holding(stacked_close, freq="daily")
hold_perf = hold_pf.sharpe_ratio
hold_perf
Modeling
In [ ]:
X = data.run("talib")
X.shape
In [ ]:
trendlb = data.run("trendlb", 1.0, 0.5, mode="binary")
y = trendlb.labels
y.shape
In [ ]:
X = X.replace([-np.inf, np.inf], np.nan)
invalid_column_mask = X.isnull().all(axis=0) | (X.nunique() == 1)
X = X.loc[:, ~invalid_column_mask]
invalid_row_mask = X.isnull().any(axis=1) | y.isnull()
X = X.loc[~invalid_row_mask]
y = y.loc[~invalid_row_mask]
X.shape, y.shape
In [ ]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
In [ ]:
cv = vbt.SplitterCV(
    "from_expanding",
    min_length=360,
    offset=180,
    split=-180,
    set_labels=["train", "test"]
)
cv_splitter = cv.get_splitter(X)
cv_splitter.plot().show_svg()
In [ ]:
from sklearn.model_selection import cross_val_score

cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
In [ ]:
X_slices = cv_splitter.take(X)
y_slices = cv_splitter.take(y)
In [ ]:
test_labels = []
test_preds = []
for split in X_slices.index.unique(level="split"):
    X_train_slice = X_slices[(split, "train")]
    y_train_slice = y_slices[(split, "train")]
    X_test_slice = X_slices[(split, "test")]
    y_test_slice = y_slices[(split, "test")]
    slice_clf = clf.fit(X_train_slice, y_train_slice)
    test_pred = slice_clf.predict(X_test_slice)
    test_pred = pd.Series(test_pred, index=y_test_slice.index)
    test_labels.append(y_test_slice)
    test_preds.append(test_pred)
test_labels = pd.concat(test_labels).rename("labels")
test_preds = pd.concat(test_preds).rename("preds")
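As an added sanity check, the pooled out-of-sample accuracy of the stitched predictions should be in the same ballpark as the per-split scores returned by cross_val_score above:
In [ ]:
# Added check: pooled out-of-sample accuracy from the
# stitched test labels and predictions
(test_labels == test_preds).mean()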
In [ ]:
data.close.vbt.overlay_with_heatmap(test_labels).show_svg()
In [ ]:
data.close.vbt.overlay_with_heatmap(test_preds).show_svg()
In [ ]:
pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],
    test_preds == 1,
    test_preds == 0,
    direction="both"
)
pf.stats()