This code downloads and processes stock price data, trains an autoencoder to compress and reconstruct it, and then uses the learned embeddings for clustering and visualization. First, it fetches historical stock prices from Yahoo Finance. It then computes log returns, moving averages, and volatility to form the features. A PyTorch autoencoder is trained on these features to learn compressed representations (embeddings), one per trading day. Finally, it applies K-Means clustering to the embeddings and visualizes the clusters with PCA.
import yfinance as yf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
Define a list of stock symbols to fetch data for
symbols = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA", "BRK-B", "V", "JNJ", "WMT",
    "JPM", "MA", "PG", "UNH", "DIS", "NVDA", "HD", "PYPL", "BAC", "VZ",
    "ADBE", "CMCSA", "NFLX", "KO", "NKE", "MRK", "PEP", "T", "PFE", "INTC",
]
Download adjusted close prices for the specified symbols from Yahoo Finance
stock_data = yf.download(
    symbols, start="2020-01-01", end="2023-12-31"
)["Adj Close"]
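Depending on your yfinance version, this call may raise a KeyError: newer releases default to auto_adjust=True, which folds the adjustment into "Close" and drops the "Adj Close" column. If that happens, a minimal fix is to request the old behavior explicitly:

# Sketch: keep the "Adj Close" column on newer yfinance versions
stock_data = yf.download(
    symbols, start="2020-01-01", end="2023-12-31", auto_adjust=False
)["Adj Close"]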
Calculate log returns, moving averages, and volatility for the stock data
log_returns = np.log(stock_data / stock_data.shift(1))
moving_avg = stock_data.rolling(window=22).mean()
volatility = stock_data.rolling(window=22).std()
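For reference, the log return on day t is r_t = ln(P_t / P_{t-1}) = ln P_t - ln P_{t-1}, i.e. the first difference of log prices, and the 22-day window approximates one trading month. The equivalence gives a cheap sanity check on the line above (a minimal sketch using the stock_data frame already defined):

# Sketch: log returns are the first difference of log prices
log_returns_alt = np.log(stock_data).diff()
assert np.allclose(log_returns, log_returns_alt, equal_nan=True)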
Concatenate the features and standardize them
# Label each feature block so the three copies of the ticker columns
# get unique names instead of duplicate labels
features = pd.concat(
    [log_returns, moving_avg, volatility],
    axis=1,
    keys=["log_ret", "mov_avg", "vol"],
).dropna()
processed_data = (features - features.mean()) / features.std()
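A quick sanity check on the standardization, shown as a sketch: after z-scoring, every column should have mean roughly 0 and standard deviation roughly 1.

# Sketch: verify the z-scoring column by column
print(processed_data.shape)
print(processed_data.mean().abs().max())       # should be ~0
print((processed_data.std() - 1).abs().max())  # should be ~0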
Convert the features into PyTorch tensors
tensor = torch.tensor(processed_data.values, dtype=torch.float32)
dataset = TensorDataset(tensor)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
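It can help to peek at one batch before training; a minimal sketch, assuming all 30 tickers downloaded cleanly (30 tickers × 3 feature blocks = 90 columns):

# Sketch: inspect one batch to confirm the expected shape
(batch,) = next(iter(data_loader))
print(batch.shape)  # expected: torch.Size([32, 90])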
Define an autoencoder neural network for stock data embedding
class StockAutoencoder(nn.Module):
    """Autoencoder neural network for stock data embedding

    This class defines an autoencoder with an encoder and decoder
    to compress and reconstruct stock data.

    Parameters
    ----------
    feature_dim : int
        The dimensionality of the input features
    """

    def __init__(self, feature_dim):
        super(StockAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 10),  # Latent space
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            # No final activation: the standardized inputs take negative
            # values, which a ReLU output could never reconstruct
            nn.Linear(64, feature_dim),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
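Before committing to a full training run, a shape-preserving forward pass on random data is a cheap smoke test (a sketch; the untrained outputs are meaningless):

# Sketch: untrained forward pass should preserve the feature dimension
smoke_model = StockAutoencoder(tensor.shape[1])
dummy = torch.randn(4, tensor.shape[1])
print(smoke_model(dummy).shape)  # same trailing dim as the input features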
Train the autoencoder on the stock data using MSE loss and Adam optimizer
def train(model, data_loader, epochs=100):
    """Train the autoencoder model

    This function trains the autoencoder using MSE loss and Adam
    optimizer over a specified number of epochs.

    Parameters
    ----------
    model : nn.Module
        The autoencoder model to be trained
    data_loader : DataLoader
        DataLoader object to iterate through the dataset
    epochs : int, optional
        Number of epochs to train the model (default is 100)
    """
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(epochs):
        for data in data_loader:
            inputs = data[0]
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")
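Note that the loss printed above is the final batch's loss in each epoch, which can be noisy. A hypothetical variant that logs the per-epoch average instead:

# Sketch: variant of train() that reports the average loss per epoch
def train_avg(model, data_loader, epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(epochs):
        total_loss, n_samples = 0.0, 0
        for (inputs,) in data_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
            n_samples += inputs.size(0)
        print(f"Epoch {epoch+1}, Avg Loss: {total_loss / n_samples:.6f}")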
Initialize and train the autoencoder model
feature_dim = processed_data.shape[1]
model = StockAutoencoder(feature_dim)
train(model, data_loader)
Extract embeddings from the trained autoencoder model
def extract_embeddings(model, data_loader):
    """Extract embeddings from the trained autoencoder model

    This function extracts embeddings by passing data through
    the encoder part of the autoencoder.

    Parameters
    ----------
    model : nn.Module
        The trained autoencoder model
    data_loader : DataLoader
        DataLoader object to iterate through the dataset

    Returns
    -------
    embeddings : torch.Tensor
        Tensor containing the extracted embeddings
    """
    model.eval()
    embeddings = []
    with torch.no_grad():
        for data in data_loader:
            inputs = data[0]
            encoded = model.encoder(inputs)
            embeddings.append(encoded)
    return torch.vstack(embeddings)
Extract embeddings from the trained model
# Use a non-shuffled loader so the embedding order matches the rows
# (trading days) of processed_data
eval_loader = DataLoader(dataset, batch_size=32, shuffle=False)
embeddings = extract_embeddings(model, eval_loader)
Apply K-Means clustering on the embeddings
kmeans = KMeans(n_clusters=5, random_state=42).fit(embeddings.numpy())
clusters = kmeans.labels_
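The choice of 5 clusters is arbitrary; a quick inertia scan (the classic elbow heuristic, sketched below) is one way to sanity-check it:

# Sketch: inertia for a range of k; look for the "elbow"
X = embeddings.numpy()
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    print(k, km.inertia_)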
Reduce the dimensionality of embeddings using PCA for visualization
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings.numpy())
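It's worth checking how faithful the 2D view is before reading too much into the plot; a sketch:

# Sketch: fraction of embedding variance captured by the two PCA axes
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())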
Plot the clusters in 2D space using PCA components
sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hue=clusters,
    palette=sns.color_palette("hsv", len(set(clusters))),
)
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.show()
PyQuant News is where finance practitioners level up with Python for quant finance, algorithmic trading, and market data analysis. Looking to get started? Check out the fastest growing, top-selling course to get started with Python for quant finance. For educational purposes. Not investment advice. Use at your own risk.
