From 978cd7e2bedaad36d46c064a730b1b6d5e344980 Mon Sep 17 00:00:00 2001 From: David Brazda Date: Thu, 3 Oct 2024 07:45:25 +0200 Subject: [PATCH] Add files via upload --- to_explore/markov_variance_switching.ipynb | 874 +++++++++++++++++++++ 1 file changed, 874 insertions(+) create mode 100644 to_explore/markov_variance_switching.ipynb diff --git a/to_explore/markov_variance_switching.ipynb b/to_explore/markov_variance_switching.ipynb new file mode 100644 index 0000000..e508381 --- /dev/null +++ b/to_explore/markov_variance_switching.ipynb @@ -0,0 +1,874 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Markov Variance Switching\n", + "\n", + "Kim, C., Nelson, C., and Startz, R. (1998). Testing for mean reversion in heteroskedastic data based on Gibbs-sampling-augmented randomization. Journal of Empirical Finance, (5)2, pp.131-154.\n", + "\n", + "**Author:** shittles\n", + "\n", + "**Created:** 2024-09-18\n", + "\n", + "**Modified:** 2024-09-19\n", + "\n", + "## Changelog\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "import statsmodels.api as sm\n", + "\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.preprocessing import RobustScaler\n", + "\n", + "from vectorbtpro import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ray.init()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "vbt.settings.set_theme(\"dark\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ingestion\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "symbol = \"^GSPC\" # the S&P 500 ticker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = vbt.YFData.pull(\n", + " symbol, start=\"50 years ago\", end=\"today\", timeframe=\"daily\", tz=\"UTC\"\n", + ") # 50 years of data\n", + "\n", + "data.stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.data[symbol]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.data[symbol].index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The opens are corrupt...\n", + "data.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# but the closes are fine.\n", + "data.data[\"^GSPC\"][\"Close\"].vbt.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.data[symbol][\"Dividends\"].any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.data[symbol][\"Stock Splits\"].any()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.remove_features([\"Dividends\", \"Stock Splits\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# data = data.transform(lambda df: df.loc[\"April 19th 1982\" < df.index])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# data.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# len(data.index) / 365.25 # 30 years of data remaining" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modelling\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# sr_open = data.get(\"Open\")\n", + "# sr_high = data.get(\"High\")\n", + "# sr_low = data.get(\"Low\")\n", + "sr_close = data.get(\"Close\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# sr_log_open = np.log(sr_open)\n", + "# sr_log_high = np.log(sr_high)\n", + "# sr_log_low = np.log(sr_low)\n", + "sr_log_close = np.log(sr_close)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "sr_log_returns = data.log_returns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sr_log_returns.vbt.plot().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "column_transformer = make_column_transformer(\n", + " (RobustScaler(), [symbol]),\n", + ")\n", + "\n", + "sr_log_returns_scaled = pd.Series(\n", + " data=column_transformer.fit_transform(pd.DataFrame(sr_log_returns)).ravel(),\n", + " index=sr_log_returns.index,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sr_log_returns_scaled.vbt.plot().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "k_regimes_kns = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kns = sm.tsa.MarkovRegression(\n", + " sr_log_returns_scaled, k_regimes=k_regimes_kns, trend=\"n\", switching_variance=True\n", + ")\n", + "results_kns = kns.fit()\n", + "\n", + "results_kns.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_kns.filtered_marginal_probabilities # using data until time t (excluding time t+1, ..., T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_kns.smoothed_marginal_probabilities # using data until time T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = vbt.make_subplots(\n", + " rows=k_regimes_kns,\n", + " cols=1,\n", + " y_title=\"Smoothed Marginal Variance Regime Probabilities\",\n", + " shared_xaxes=True,\n", + " subplot_titles=[\n", + " \"Medium-variance\",\n", + " \"Low-variance\",\n", + " \"High-variance\",\n", + " ], # order changes dependent on fit\n", + ")\n", + "\n", + "for i in range(k_regimes_kns):\n", + " fig = results_kns.smoothed_marginal_probabilities[i].vbt.plot(\n", + " add_trace_kwargs=dict(row=i + 1, col=1), fig=fig\n", + " )\n", + "\n", + "\n", + "fig.update_layout(showlegend=False)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_annotated_line(\n", + " fig: go.Figure,\n", + " x: pd.Series,\n", + " y: pd.Series,\n", + " classes: pd.Series,\n", + " dict_class_colours: dict,\n", + " dict_class_labels: dict,\n", + ") -> go.Figure:\n", + " \"\"\"Plot a line chart where each trace is coloured based on its class.\n", + "\n", + " Yes, plotly really doesn't support this out of the box.\n", + "\n", + " Args:\n", + " fig: Figure.\n", + " x: Indices.\n", + " y: Close prices.\n", + " classes: Regimes.\n", + " dict_class_colours: In the format {class: colour}\n", + " dict_class_labels: In the format {class: label}\n", + "\n", + " Returns:\n", + " fig: The figure.\n", + " \"\"\"\n", + " # Plot each segment in its corresponding color.\n", + " for i in range(len(x) - 1):\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=x[i : i + 2],\n", + " y=y[i : i + 2],\n", + " mode=\"lines\",\n", + " line=dict(color=dict_class_colours[classes[i]], width=2),\n", + " showlegend=False, # added manually\n", + " )\n", + " )\n", + "\n", + " # Label each regime.\n", + " for regime, colour in dict_class_colours.items():\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=[None],\n", + " y=[None],\n", + " mode=\"lines\",\n", + " line=dict(color=colour, width=2),\n", + " name=dict_class_labels[regime],\n", + " )\n", + " )\n", + "\n", + " return fig" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "sr_variance_regime_forecasts = results_kns.filtered_marginal_probabilities.idxmax(\n", + " axis=1\n", + ")\n", + "\n", + "sr_variance_regime_predictions = results_kns.smoothed_marginal_probabilities.idxmax(\n", + " axis=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# sr_variance_regime_forecasts.vbt.plot().show()\n", + "sr_variance_regime_predictions.vbt.plot().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# order changes dependent on fit\n", + "dict_variance_regime_labels = {\n", + " 0: \"Medium\",\n", + " 1: \"Low\",\n", + " 2: \"High\",\n", + "}\n", + "\n", + "dict_variance_regime_colours = {\n", + " 0: \"orange\",\n", + " 1: \"green\",\n", + " 2: \"red\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = vbt.make_figure()\n", + "\n", + "fig = plot_annotated_line(\n", + " fig,\n", + " data.index,\n", + " sr_log_close,\n", + " # sr_variance_regime_forecasts.rolling(5).mean().fillna(0).round(0),\n", + " sr_variance_regime_forecasts,\n", + " # sr_variance_regime_predictions,\n", + " dict_variance_regime_colours,\n", + " dict_variance_regime_labels,\n", + ")\n", + "\n", + "fig.update_layout(\n", + " title=\"Filtered Variance Regime Labels\",\n", + " # title=\"Smoothed Variance Regime Labels\",\n", + " xaxis_title=\"Date\",\n", + " yaxis_title=\"Log Close\",\n", + " showlegend=True,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Backtest\n", + "### Filtered marginal probabilities\n", + "A backtest using filtered marginal probabilities.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO - Double check me!\n", + "# Assuming that you sell today if yesterday was in the high-variance regime.\n", + "# entries = (sr_variance_regime_forecasts != 2).vbt.signals.fshift()\n", + "# exits = (sr_variance_regime_forecasts == 2).vbt.signals.fshift()\n", + "\n", + "# Assuming that you sell today (at the close) if today was in the high-variance regime.\n", + "entries = (sr_variance_regime_forecasts != 2)\n", + "exits = (sr_variance_regime_forecasts == 2)\n", + "\n", + "# I haven't tested any additional logic.\n", + "# entries = (sr_variance_regime_forecasts.rolling(5).mean().fillna(0).round(0) != 2)\n", + "\n", + "clean_entries, clean_exits = entries.vbt.signals.clean(exits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = sr_variance_regime_forecasts.vbt.plot()\n", + "\n", + "clean_entries.vbt.signals.plot_as_entries(sr_variance_regime_forecasts, fig=fig)\n", + "clean_exits.vbt.signals.plot_as_exits(sr_variance_regime_forecasts, fig=fig)\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf = vbt.Portfolio.from_signals(\n", + " close=sr_close,\n", + " entries=clean_entries,\n", + " exits=clean_exits,\n", + " direction=\"both\",\n", + " fees=0.001,\n", + " size=1.0,\n", + " size_type=vbt.pf_enums.SizeType.ValuePercent,\n", + ")\n", + "\n", + "pf.stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.drawdowns.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot_underwater(pct_scale=True).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Re-fitting Every Day\n", + "A backtest with a single training and validation set, that's refit every day" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "n_days_validation = int(365.25 * 2) # 2 years of data held back for the validation set" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# slices_sr = vbt.Splitter.split_range(slice(None), new_split=-n_days_validation, index=data.index)\n", + "splitter_fr = vbt.Splitter.from_rolling(\n", + " data.index, length=len(data.index), split=len(data.index) - n_days_validation\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "splitter_fr.plot().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a Ray remote function for parallelization.\n", + "@ray.remote\n", + "def compute_smoothed_marginal_probabilities(sr):\n", + " kns = sm.tsa.MarkovRegression(\n", + " sr,\n", + " k_regimes=k_regimes_kns,\n", + " trend=\"n\",\n", + " switching_variance=True,\n", + " )\n", + " results_kns = kns.fit()\n", + "\n", + " # the smoothing might not work properly out of sample\n", + " return results_kns.smoothed_marginal_probabilities.iloc[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# The predict method doesn't support out of sample forecasts...\n", + "# kns = sm.tsa.MarkovRegression(\n", + "# sr_log_returns[splitter_fr.get_mask()[\"set_0\"]],\n", + "# k_regimes=k_regimes_kns,\n", + "# trend=\"n\",\n", + "# switching_variance=True,\n", + "# )\n", + "# results_kns = kns.fit()\n", + "\n", + "# results_kns.summary()\n", + "\n", + "# results_kns.predict(sr_log_returns[splitter_fr.get_mask()[\"set_1\"]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://github.com/statsmodels/statsmodels/issues/7982" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "sr_train = sr_log_returns[splitter_fr.get_mask()[\"set_0\"]]\n", + "sr_validate = sr_log_returns[splitter_fr.get_mask()[\"set_1\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ...so re-fit the model every timestep.\n", + "futures = []\n", + "\n", + "sr_log_returns_to_date = sr_train.copy()\n", + "\n", + "# launch parallel tasks\n", + "for i in range(len(sr_validate)):\n", + " sr_log_returns_to_date = pd.concat(\n", + " [\n", + " sr_log_returns_to_date,\n", + " pd.Series(sr_validate.iloc[i], index=[sr_validate.index[i]]),\n", + " ]\n", + " )\n", + "\n", + " futures.append(\n", + " compute_smoothed_marginal_probabilities.remote(sr_log_returns_to_date)\n", + " )\n", + "\n", + "# collect results\n", + "smoothed_marginal_probabilities = ray.get(futures)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "smoothed_marginal_probabilities = pd.concat(smoothed_marginal_probabilities, axis=1).T\n", + "\n", + "smoothed_marginal_probabilities.index.name = \"Date\"" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "sr_variance_regime_predictions = smoothed_marginal_probabilities.idxmax(\n", + " axis=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sr_close_validate = sr_close[splitter_fr.get_mask()[\"set_1\"]]\n", + "\n", + "sr_log_close_validate = sr_log_close[splitter_fr.get_mask()[\"set_1\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = vbt.make_figure()\n", + "\n", + "fig = plot_annotated_line(\n", + " fig,\n", + " data.index[splitter_fr.get_mask()[\"set_1\"]],\n", + " sr_log_close_validate,\n", + " sr_variance_regime_predictions,\n", + " dict_variance_regime_colours,\n", + " dict_variance_regime_labels,\n", + ")\n", + "\n", + "fig.update_layout(\n", + " title=\"Variance Regime Forecasts\",\n", + " xaxis_title=\"Date\",\n", + " yaxis_title=\"Log Close\",\n", + " showlegend=True,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO - Double check me!\n", + "# Assuming that you sell today if yesterday was in the high-variance regime.\n", + "# entries = (sr_variance_regime_forecasts != 2).vbt.signals.fshift()\n", + "# exits = (sr_variance_regime_forecasts == 2).vbt.signals.fshift()\n", + "\n", + "# Assuming that you sell today (at the close) if today was in the high-variance regime.\n", + "entries = (sr_variance_regime_forecasts != 2)\n", + "exits = (sr_variance_regime_forecasts == 2)\n", + "\n", + "# I haven't tested any additional logic.\n", + "# entries = (sr_variance_regime_forecasts.rolling(5).mean().fillna(0).round(0) != 2)\n", + "\n", + "clean_entries, clean_exits = entries.vbt.signals.clean(exits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = sr_variance_regime_forecasts.vbt.plot()\n", + "\n", + "clean_entries.vbt.signals.plot_as_entries(sr_variance_regime_forecasts, fig=fig)\n", + "clean_exits.vbt.signals.plot_as_exits(sr_variance_regime_forecasts, fig=fig)\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "pf = vbt.Portfolio.from_signals(\n", + " close=sr_close_validate,\n", + " entries=clean_entries,\n", + " exits=clean_exits,\n", + " direction=\"both\",\n", + " fees=0.001,\n", + " size=1.0,\n", + " size_type=vbt.pf_enums.SizeType.ValuePercent,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.drawdowns.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot_underwater(pct_scale=True).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Observations\n", + "- For this implementation you have to use the filtered probabilities to not introduce look-ahead bias.\n", + "- If you backtest the smoothed probilities (which smoothes using all of the data) it only performs well after the great financial crisis. Why? No clue.\n", + "- It looks like its better for labelling than it is as a strategy in its current state.\n", + "- After slow recessions like the dot-com bubble, there can be a medium-variance decline which this simple strategy doesn't capture.\n", + "- After fast recessions like covid-19, there can be a high-variance rebound which this simple strategy doesn't capture.\n", + "- **It looks like you can safely leverage up during low-variance regimes.**\n", + "- Maybe you could combine this strategy with other recession-leading indicators (e.g. manufacturing/services pmi, federal funds rate, 10y-2y yield curve, 1y-3mo yield curve) to help time the tops?\n", + "- Maybe you could combine this strategy with another trend-following strategy (e.g. VWAP, EMA, BBANDS, ADX) to help time the bottoms?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "ray.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}