From b974ab7d39a36d4d846fbb4a13b3c48e10e9c5f2 Mon Sep 17 00:00:00 2001 From: David Brazda Date: Thu, 3 Oct 2024 07:37:19 +0200 Subject: [PATCH] Add files via upload --- to_explore/1_principal_components.ipynb | 1012 +++++++++++++++++++++++ 1 file changed, 1012 insertions(+) create mode 100644 to_explore/1_principal_components.ipynb diff --git a/to_explore/1_principal_components.ipynb b/to_explore/1_principal_components.ipynb new file mode 100644 index 0000000..2881ec1 --- /dev/null +++ b/to_explore/1_principal_components.ipynb @@ -0,0 +1,1012 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Principal Component Analysis\n", + "In trading you make money when you know something that no-one else knows. But if you know that that thing will happen with certainty, but only happen once, how do you show that you're not lucky? You have to show that the circumstances are abnormal and that your model is confident under those circumstances.\n", + "\n", + "This video touches on this idea. Firstly, he selects the parameters of an indicator that convey the most unique (uncorrelated) information using prinipal components analysis. He then creates a target variable for the indicator to predict, and fits a model to it.\n", + "\n", + "The key finding is that whilst the model has almost no predictive power in normal cases (when predicting small changes), **in extreme cases (when predicting large changes) the model appears to be predictive.**\n", + "\n", + "Again, **lots of indicators might only be predictive when they predict something extreme.** He's filtering out areas of the model where you can't be sure that it works. When it predicts something extreme, it's got space to be wrong (if it predicts a large move up and it's really only a small move up, you're probably still winning).\n", + "\n", + "#### So...\n", + "- Dimensionality reduction is very useful.\n", + "- Selective prediction is very useful.\n", + "- You can quantify a model's success not by its accuracy, but by the ability to know when it's right.\n", + "- Money is made based on the uniqueness of the information you hold, the certainty of its truth, and the magnitude of its impact.\n", + " - If you know something highly abnormal will happen with high certainty, then you can make a lot of money.\n", + "\n", + "#### Future Ideas\n", + "- Run a similar process on the 101 World Quant Alphas.\n", + "- Switch out the model to something robust to noise and non-linear (e.g. a random forest).\n", + "- Switch out the target to something that better represents trend following / reversal.\n", + " - Label tops and bottoms with directional change events.\n", + " - Accuracy to top/bottom (time units e.g. days)\n", + " - Accuracy to top/bottom (price units e.g. log returns)\n", + " - Calculate expected length of trend from\n", + " - Label variance with Markov variance regimes.\n", + "\n", + "**Author:** shittles\n", + "\n", + "**Created:** 2024-09-26\n", + "\n", + "**Modified:** 2024-10-01\n", + "\n", + "## Sources\n", + "- https://www.youtube.com/watch?v=mdncZ034Q7k\n", + "- https://github.com/neurotrader888/RSI-PCA/tree/main\n", + "\n", + "## Changelog\n", + "- Watched video, crafted theory, and wrote opening paragraph (2h - 2024/09/26)\n", + "- Followed along with video (4h - 2024/09/27)\n", + "- Scikit-learn pipeline (3h - 2024/10/01)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from scipy import linalg as la\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.decomposition import PCA, FactorAnalysis\n", + "from sklearn.linear_model import LinearRegression\n", + "# from sklearn.model_selection import cross_val_score\n", + "from vectorbtpro import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "plt.style.use(\"dark_background\")\n", + "\n", + "vbt.settings.set_theme(\"dark\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## US Stock Market\n", + "\n", + "### Ingestion\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = vbt.YFData.pull(\n", + " [\"^GSPC\"],\n", + " start=\"50 years ago\",\n", + " end=\"today\",\n", + " timeframe=\"daily\",\n", + " tz=\"UTC\",\n", + " missing_columns=\"drop\",\n", + ")\n", + "\n", + "data.stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.data[\"^GSPC\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The opens are corrupt...\n", + "data.plot(symbol=\"^GSPC\", yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# but the closes are fine.\n", + "data.data[\"^GSPC\"][\"Close\"].vbt.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleaning\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.get_feature(\"Dividends\").any(axis=1).all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.get_feature(\"Stock Splits\").any(axis=1).all()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.remove_features([\"Dividends\", \"Stock Splits\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# data = data.transform(lambda df: df.loc[\"April 19th 1982\" < df.index])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# data = data.resample(\"daily\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "sr_open_gspc = data.get_feature(\"Open\")[\"^GSPC\"]\n", + "sr_high_gspc = data.get_feature(\"High\")[\"^GSPC\"]\n", + "sr_low_gspc = data.get_feature(\"Low\")[\"^GSPC\"]\n", + "sr_close_gspc = data.get_feature(\"Close\")[\"^GSPC\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Following Along\n", + "\n", + "### Calculate and Analyse the RSI\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vbt.IF.list_indicators(\"RSI\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vbt.phelp(vbt.RSI.run)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rsi_periods = np.arange(2, 25)\n", + "\n", + "rsi = vbt.RSI.run(sr_close_gspc, rsi_periods)\n", + "\n", + "rsi.rsi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rsi.rsi.hist(bins=100)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(rsi.rsi.corr())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute the Principal Components\n", + "Using principal component analysis (eigenvalues and vectors) as in the video.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df_rsi_means = rsi.rsi.mean()\n", + "\n", + "# Centre the RSI about zero.\n", + "df_rsi_values = rsi.rsi - df_rsi_means\n", + "\n", + "df_rsi_values = df_rsi_values.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_rsi_values = pd.DataFrame(data=StandardScaler().fit_transform(rsi.rsi.dropna()), index=rsi.rsi.dropna().index, columns=rsi.rsi.columns)\n", + "\n", + "df_rsi_values" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the covariance and eigenvectors.\n", + "arr_covariances = np.cov(df_rsi_values, rowvar=False)\n", + "arr_eigenvals, arr_eigenvecs = la.eigh(arr_covariances)\n", + "\n", + "# Sort the eigenvectors by the eigenvalues.\n", + "idx = np.argsort(arr_eigenvals)[::-1]\n", + "\n", + "arr_eigenvecs = arr_eigenvecs[:, idx]\n", + "arr_eigenvals = arr_eigenvals[idx]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_components = 4\n", + "\n", + "df_X = pd.DataFrame()\n", + "\n", + "for j in range(n_components):\n", + " df_X[j] = pd.Series(np.dot(df_rsi_values, arr_eigenvecs[j]), index=df_rsi_values.index)\n", + "\n", + "df_X.columns.name = \"PC\"\n", + "\n", + "df_X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The principal components pay little-to-no attention to the higher periods.\n", + "# This is unsuprising because they are highly correlated, and so don't convey unique information.\n", + "for j in range(n_components):\n", + " pd.Series(arr_eigenvecs[j], index=rsi_periods).plot(label=\"PC\" + str(j + 1))\n", + "\n", + "plt.xlabel(\"RSI Period\")\n", + "plt.ylabel(\"Eigenvector Value\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using single value decomposition as in scikit-learn.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "U, S, Vt = la.svd(df_rsi_values, full_matrices=False)\n", + "\n", + "# df_X = pd.DataFrame(U @ np.diag(S), index=rsi.rsi.dropna().index)\n", + "df_X = pd.DataFrame(U[:, :n_components] * S[:n_components], index=rsi.rsi.dropna().index)\n", + "\n", + "df_X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the right singular vectors (Vt) for each component against the RSI periods\n", + "for j in range(n_components):\n", + " pd.Series(Vt[j], index=rsi_periods).plot(label=\"PC\" + str(j + 1))\n", + "\n", + "plt.xlabel(\"RSI Period\")\n", + "plt.ylabel(\"Right Singular Vector Value (Component Loadings)\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# The targets are the returns in 5 days time.\n", + "sr_y = data.returns.shift(-5)[\"^GSPC\"]\n", + "\n", + "sr_y = sr_y.dropna()\n", + "\n", + "# Ensure the features and targets have the same index.\n", + "common_index = df_X.index.intersection(sr_y.index)\n", + "\n", + "df_X = df_X.loc[common_index]\n", + "sr_y = sr_y.loc[common_index]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the model.\n", + "model_coefs = la.lstsq(df_X, sr_y)[0]\n", + "sr_y_hat = pd.Series(np.dot(df_X, model_coefs), index=sr_y.index)\n", + "\n", + "long_threshold = sr_y_hat.quantile(0.99)\n", + "short_threshold = sr_y_hat.quantile(0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The plot's small, so make the marker size smaller.\n", + "plt.scatter(sr_y_hat, sr_y, s=1)\n", + "plt.axhline(0.0, color=\"white\")\n", + "plt.axvline(long_threshold, color=\"green\")\n", + "plt.axvline(short_threshold, color=\"red\")\n", + "\n", + "plt.xlabel(\"Predictions\")\n", + "plt.ylabel(\"Targets\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modifying Code\n", + "From here on out, I'm significantly modifying the source code. I'm using scikit-learn's decomposition and prediction models, and vectorbtpro's indicators and backtesting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use sci-kit learn to easily swap out decomposition models.\n", + "column_transformer = make_column_transformer(\n", + " (StandardScaler(), np.arange(0, len(rsi.rsi.columns))),\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " data=column_transformer.fit_transform(rsi.rsi.dropna()),\n", + " index=rsi.rsi.dropna().index,\n", + " # In this case the column order hasn't been transformed.\n", + " # columns=column_transformer.get_feature_names_out(),\n", + " columns=rsi.rsi.columns,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# There's a lot of room here to do more.\n", + "pipeline = make_pipeline(\n", + " column_transformer,\n", + " # PCA(n_components),\n", + " FactorAnalysis(n_components),\n", + ")\n", + "\n", + "df_X = pd.DataFrame(\n", + " data=pipeline.fit_transform(rsi.rsi.dropna()),\n", + " index=rsi.rsi\n", + " .dropna().index,\n", + ")\n", + "\n", + "# The columns are now the principal components.\n", + "df_X.columns.name = \"PC\"\n", + "\n", + "df_X" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# # Show the explained variance ratio of each component.\n", + "# model_pca = pipeline.named_steps[\"pca\"]\n", + "model_fa = pipeline.named_steps[\"factoranalysis\"]\n", + "\n", + "# explained_variance = model_pca.explained_variance_ratio_" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# plt.plot(range(1, len(explained_variance) + 1), explained_variance.cumsum(), marker=\"o\", linestyle=\"--\")\n", + "\n", + "# plt.title(\"Explained Variance Ratio\")\n", + "# plt.xlabel(\"Number of Components\")\n", + "# plt.ylabel(\"Cumulative Explained Variance\")\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify the results from before.\n", + "# Plot the eigenvectors for each principal component\n", + "for j in range(n_components):\n", + " # pd.Series(model_pca.components_[j], index=rsi_periods).plot(label=\"PC\" + str(j + 1))\n", + " pd.Series(model_fa.components_[j], index=rsi_periods).plot(label=\"C\" + str(j + 1))\n", + "\n", + "plt.xlabel(\"RSI Period\")\n", + "# plt.ylabel(\"Eigenvector Value\")\n", + "plt.ylabel(\"Latent Vector Value\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# The targets are the returns in 5 days time.\n", + "sr_y = data.returns.shift(-5)[\"^GSPC\"]\n", + "\n", + "sr_y = sr_y.dropna()\n", + "\n", + "# Ensure the features and targets have the same index.\n", + "common_index = df_X.index.intersection(sr_y.index)\n", + "\n", + "df_X = df_X.loc[common_index]\n", + "sr_y = sr_y.loc[common_index]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the model.\n", + "clf = LinearRegression() \n", + "\n", + "clf.fit(df_X, sr_y)\n", + "\n", + "sr_y_hat_test = pd.Series(clf.predict(df_X), index=sr_y.index)\n", + "\n", + "long_threshold = sr_y_hat.quantile(0.99)\n", + "short_threshold = sr_y_hat.quantile(0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The plot's small, so make the marker size smaller.\n", + "plt.scatter(sr_y_hat, sr_y, s=1)\n", + "plt.axhline(0.0, color=\"white\")\n", + "plt.axvline(long_threshold, color=\"green\")\n", + "plt.axvline(short_threshold, color=\"red\")\n", + "\n", + "plt.xlabel(\"Predictions\")\n", + "plt.ylabel(\"Targets\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the model was better you'd expect a non-constant relationship between the targets and predictions. There are a million different metrics you can use to check this out.\n", + "\n", + "### Backtest\n", + "Honestly, I'm pretty surprised at how not shit this is given that it's only using the RSI and a linear model looking at super simple targets.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "df_rsi = rsi.rsi.dropna()\n", + "\n", + "# The targets are the returns in 5 days time.\n", + "sr_y = np.log(sr_close_gspc).diff(5).shift(-5).dropna()\n", + "\n", + "# Ensure the features and targets have the same index.\n", + "common_index = df_rsi.index.intersection(sr_y.index)\n", + "\n", + "df_rsi = df_rsi.loc[common_index]\n", + "sr_y = sr_y.loc[common_index]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "cv = vbt.SplitterCV(\n", + " \"from_rolling\",\n", + " length=3 * 250,\n", + " offset=-250,\n", + " split=2 * 250,\n", + " set_labels=[\"train\", \"test\"],\n", + ")\n", + "\n", + "cv_splitter = cv.get_splitter(df_rsi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv_splitter.plot().show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sr_rsi_slices = cv_splitter.take(df_rsi)\n", + "sr_y_slices = cv_splitter.take(sr_y)\n", + "\n", + "clf = LinearRegression() \n", + "\n", + "list_result_test = []\n", + "\n", + "for split in sr_rsi_slices.index.unique(level=\"split\"):\n", + " df_rsi_train = sr_rsi_slices[(split, \"train\")]\n", + " df_rsi_test = sr_rsi_slices[(split, \"test\")]\n", + "\n", + " df_X_train = pd.DataFrame(\n", + " data=pipeline.fit_transform(df_rsi_train),\n", + " index=df_rsi_train.index,\n", + " )\n", + " df_X_test = pd.DataFrame(\n", + " data=pipeline.fit_transform(df_rsi_test),\n", + " index=df_rsi_test.index,\n", + " )\n", + " df_X_train.columns.name = \"PC\"\n", + " df_X_test.columns.name = \"PC\"\n", + "\n", + " sr_y_train = sr_y_slices[(split, \"train\")]\n", + " sr_y_test = sr_y_slices[(split, \"test\")]\n", + "\n", + " clf.fit(df_X_train, sr_y_train)\n", + "\n", + " sr_y_hat_test = pd.Series(clf.predict(df_X_test), index=sr_y_test.index)\n", + "\n", + " long_threshold = sr_y_hat_test.quantile(0.99)\n", + " short_threshold = sr_y_hat_test.quantile(0.01)\n", + " \n", + " list_result_test.append(pd.DataFrame({\n", + " \"y\": sr_y_test,\n", + " \"y_hat\": sr_y_hat_test,\n", + " \"Long Threshold\": long_threshold,\n", + " \"Short Threshold\": short_threshold,\n", + " }))\n", + "\n", + "df_result_test = pd.concat(list_result_test)\n", + "\n", + "df_result_test\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_result_test.loc[df_result_test[\"y_hat\"] > df_result_test[\"Long Threshold\"], \"Entries\"] = True\n", + "df_result_test.loc[df_result_test[\"y_hat\"] < df_result_test[\"Short Threshold\"], \"Exits\"] = True\n", + "\n", + "df_result_test = df_result_test.fillna(False)\n", + "\n", + "df_result_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_result_test[\"y_hat\"].plot()\n", + "df_result_test[\"Long Threshold\"].plot(color=\"green\")\n", + "df_result_test[\"Short Threshold\"].plot(color=\"red\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "# sr_clean_entries_test, sr_clean_exits_test = sr_entries_test.vbt.signals.clean(sr_exits_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf = vbt.Portfolio.from_signals(\n", + " close=sr_close_gspc,\n", + " entries=df_result_test[\"Entries\"],\n", + " exits=df_result_test[\"Exits\"],\n", + " direction=\"longonly\",\n", + " fees=0.001,\n", + " size=1.0,\n", + " size_type=vbt.pf_enums.SizeType.ValuePercent,\n", + ")\n", + "\n", + "pf.stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.drawdowns.plot(yaxis=dict(type=\"log\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf.plot_underwater(pct_scale=True).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Original Code\n", + "For reference.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pandas_ta as ta\n", + "import numpy as np\n", + "from typing import List\n", + "import seaborn as sns\n", + "from scipy import linalg as la\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "def pca_linear_model(x: pd.DataFrame, y: pd.Series, n_components: int, thresh: float= 0.01):\n", + " # Center data at 0\n", + " means = x.mean()\n", + " x -= means\n", + " x = x.dropna()\n", + "\n", + " # Find covariance and compute eigen vectors\n", + " cov = np.cov(x, rowvar=False)\n", + " evals , evecs = la.eigh(cov)\n", + " # Sort eigenvectors by size of eigenvalue\n", + " idx = np.argsort(evals)[::-1]\n", + " evecs = evecs[:,idx]\n", + " evals = evals[idx]\n", + "\n", + " # Create data set for model\n", + " model_data = pd.DataFrame()\n", + " for j in range(n_components):\n", + " model_data['PC' + str(j)] = pd.Series( np.dot(x, evecs[j]) , index=x.index)\n", + " \n", + " cols = list(model_data.columns)\n", + " model_data['target'] = y\n", + " model_coefs = la.lstsq(model_data[cols], y)[0]\n", + " model_data['pred'] = np.dot( model_data[cols], model_coefs)\n", + "\n", + " l_thresh = model_data['pred'].quantile(0.99)\n", + " s_thresh = model_data['pred'].quantile(0.01)\n", + "\n", + " return model_coefs, evecs, means, l_thresh, s_thresh, model_data" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def pca_rsi_model(\n", + " ohlc: pd.DataFrame, rsi_lbs, train_size: int, step_size: int, \n", + " n_components: int = 2, lookahead: int = 6\n", + "): \n", + " rsis = pd.DataFrame()\n", + " for lb in rsi_lbs:\n", + " rsis[lb] = ta.rsi(ohlc['Close'], lb)\n", + "\n", + " warm_up = max(rsi_lbs) * 2\n", + " next_train = warm_up + train_size\n", + " tar = np.log(ohlc['Close']).diff(lookahead).shift(-lookahead)\n", + "\n", + " # Outputs\n", + " model_pred = np.zeros(len(ohlc))\n", + " long_thresh = np.zeros(len(ohlc))\n", + " short_thresh = np.zeros(len(ohlc))\n", + " signal = np.zeros(len(ohlc))\n", + "\n", + " model_pred[:] = np.nan\n", + " long_thresh[:] = np.nan\n", + " short_thresh[:] = np.nan\n", + "\n", + " rsi_means = None\n", + " evecs = None\n", + " model_coefs = None\n", + " l_thresh = None\n", + " s_thresh = None\n", + " for i in range(next_train, len(ohlc)):\n", + " if i == next_train:\n", + " # Get RSI values in window, prevent future leak\n", + " train_data = rsis.iloc[i - train_size: i + 1 - lookahead].copy()\n", + " y = tar.reindex(train_data.index)\n", + " \n", + " model_coefs, evecs, rsi_means, l_thresh, s_thresh, _ = pca_linear_model(train_data, y, n_components)\n", + " next_train += step_size\n", + " \n", + " curr_row = rsis.iloc[i] - rsi_means\n", + " vec = np.zeros(n_components)\n", + " for j in range(n_components):\n", + " vec[j] = np.dot(curr_row, evecs[j])\n", + " curr_pred = np.dot(vec, model_coefs)\n", + "\n", + " model_pred[i] = curr_pred\n", + " long_thresh[i] = l_thresh \n", + " short_thresh[i] = s_thresh\n", + " if curr_pred > l_thresh:\n", + " signal[i] = 1\n", + " elif curr_pred < s_thresh:\n", + " signal[i] = -1\n", + "\n", + " # Output dataframe\n", + " output_df = pd.DataFrame(index=ohlc.index)\n", + " output_df['pred'] = model_pred\n", + " output_df['long_thresh'] = long_thresh\n", + " output_df['short_thresh'] = short_thresh\n", + " output_df['signal'] = signal\n", + " # Keep signals normalized to -1 1\n", + " output_df['signal'] = output_df['signal'].rolling(lookahead).mean() \n", + " return output_df " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lookahead = 6\n", + "\n", + "df_gspc = data.data[\"^GSPC\"]\n", + "\n", + "output = pca_rsi_model(df_gspc, list(range(2, 25)), 250 * 2, 250, n_components=3, lookahead=lookahead)\n", + "output['t'] = np.log(df_gspc['Close']).diff(lookahead).shift(-lookahead)\n", + "\n", + "print(\"Mean Target Above Long Threshold\", output[output['pred'] > output['long_thresh']]['t'].mean())\n", + "print(\"Mean Target Below Short Threshold\", output[output['pred'] < output['short_thresh']]['t'].mean())\n", + "\n", + "next_r = np.log(df_gspc['Close']).diff().shift(-1)\n", + "df_gspc['strat_ret'] = next_r * output['signal']\n", + "\n", + "# Profit fac\n", + "pf = df_gspc[df_gspc['strat_ret'] > 0]['strat_ret'].sum() / df_gspc[df_gspc['strat_ret'] < 0]['strat_ret'].abs().sum()\n", + "print(\"Profit Factor\",pf)\n", + "\n", + "plt.style.use(\"dark_background\")\n", + "\n", + "df_gspc['r'] = next_r\n", + "\n", + "# for his btcusdt data\n", + "# df_gspc = df_gspc[df_gspc.index > '2020-01-01']\n", + "# output = output[output.index > '2020-01-01']\n", + "\n", + "fig, axs = plt.subplots(2, 1, sharex=True)\n", + "df_gspc['strat_ret'].cumsum().plot(label='RSI-PSA 3-6 Model', ax=axs[0])\n", + "\n", + "output['pred'].plot(ax=axs[1])\n", + "output['long_thresh'].plot(ax=axs[1], color='green')\n", + "output['short_thresh'].plot(ax=axs[1], color='red')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Heatmap code\n", + "next_r = np.log(df_gspc['Close']).diff().shift(-1)\n", + "pf_df = pd.DataFrame()\n", + "for lookahead in list(range(1, 25)):\n", + " for n_components in [1,2,3,4,5,6]:\n", + " output = pca_rsi_model(df_gspc, list(range(2, 25)), 250 * 2, 250, n_components=n_components, lookahead=lookahead)\n", + " output['t'] = np.log(df_gspc['Close']).diff(lookahead).shift(-lookahead)\n", + " df_gspc['strat_ret'] = next_r * output['signal']\n", + "\n", + " # Profit fac\n", + " pf = df_gspc[df_gspc['strat_ret'] > 0]['strat_ret'].sum() / df_gspc[df_gspc['strat_ret'] < 0]['strat_ret'].abs().sum()\n", + " print(pf)\n", + "\n", + " pf_df.loc[lookahead, n_components] = pf\n", + "\n", + "plt.style.use(\"dark_background\")\n", + "sns.heatmap(pf_df, annot=True)\n", + "plt.xlabel(\"N Components\")\n", + "plt.ylabel(\"Look Ahead\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}