strategy-lab/81_EdgarFundamentalData.ipynb at d718ed61bd41f290bcc40430414eb8867e2b579a

Files

David Brazda e3da60c647 daily update

2024-10-21 20:57:56 +02:00

9.0 KiB

Raw Blame History

No description has been provided for this image

This code extracts financial data from the SEC's EDGAR database, specifically targeting Apple's filings. It downloads and processes quarterly and annual financial statement datasets, converts them to a more efficient format (parquet), and constructs a comprehensive dataset of Apple's financials. The code then calculates key financial metrics such as P/E ratios from earnings per share (EPS) and stock price data. Additionally, it prepares the dataset for further analysis or visualization. This is useful for financial analysis, investment research, and academic purposes.

In [ ]:

import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile
from pathlib import Path
from tqdm import tqdm
import pandas as pd

In [ ]:

from openbb import obb

Set the base URL and file path for SEC data

In [ ]:

# Root of the SEC website; combined with FSN_PATH and an archive name to build
# each download URL.
SEC_URL = "https://www.sec.gov/"
# Site-relative directory that holds the quarterly "*_notes.zip" archives.
FSN_PATH = "files/dera/data/financial-statement-and-notes-data-sets/"
# Local root directory; per-quarter folders are created underneath it.
DATA_PATH = Path("edgar")
# Value sent as the User-Agent header with every download request.
# NOTE(review): this mimics a desktop browser; SEC fair-access guidance
# presumably expects automated clients to identify themselves - confirm.
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"

Generate a list of filing periods (quarters) to download

In [ ]:

filing_periods = [
    (d.year, d.quarter) for d in pd.date_range("2015", "2015-12-31", freq="QE")
]

Loop through each filing period to download and extract data

In [ ]:

# Download every quarterly "notes" archive and unpack its members into
# <DATA_PATH>/<year>_<quarter>/source/. Members already on disk are skipped,
# so the cell can be re-run without rewriting existing files.
for yr, qtr in tqdm(filing_periods):
    path = DATA_PATH / f"{yr}_{qtr}" / "source"
    path.mkdir(parents=True, exist_ok=True)
    filing = f"{yr}q{qtr}_notes.zip"
    url = f"{SEC_URL}{FSN_PATH}{filing}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    reply = requests.get(url, headers={"User-Agent": user_agent}, timeout=60)
    # Surface HTTP errors (e.g. 403/404) here instead of passing an error page
    # on to ZipFile, where it would fail with a far less helpful message.
    reply.raise_for_status()
    response = reply.content
    try:
        zip_file = ZipFile(BytesIO(response))
    except BadZipFile as err:
        raise BadZipFile(f"{url} did not return a valid zip archive") from err
    with zip_file:
        for file in zip_file.namelist():
            local_file = path / file
            if local_file.exists():
                continue
            # Stream the member in 1 MiB chunks: both handles are closed
            # deterministically and the binary payload is never split on
            # newline bytes.
            with zip_file.open(file) as member, local_file.open("wb") as output:
                while chunk := member.read(1 << 20):
                    output.write(chunk)

Convert downloaded TSV files to parquet format for efficiency

In [ ]:

# Re-encode each extracted TSV as a parquet file in a "parquet" folder that
# sits next to its "source" folder, then delete the TSV. A TSV whose parquet
# counterpart already exists is left untouched (and is not deleted).
tsv_files = sorted(DATA_PATH.glob("**/*.tsv"))
for f in tqdm(tsv_files):
    parquet_path = f.parent.parent / "parquet"
    if not parquet_path.exists():
        parquet_path.mkdir(parents=True)
    file_name = f"{f.stem}.parquet"
    target = parquet_path / file_name
    if target.exists():
        continue
    df = pd.read_csv(
        f,
        sep="\t",
        encoding="latin1",
        low_memory=False,
        on_bad_lines="skip",
    )
    df.to_parquet(target)
    f.unlink()

Filter the subset of data related to Apple Inc. for further analysis

In [ ]:

# Look up Apple's CIK (the `cik` column) in one quarter's submissions table.
sub = pd.read_parquet(DATA_PATH / "2015_3" / "parquet" / "sub.parquet")
name = "APPLE INC"
# Select the cik column of the matching rows directly. This works whether the
# company has one or several submissions in the quarter and does not depend on
# which other columns happen to contain NaN.
matching_ciks = sub.loc[sub["name"] == name, "cik"]
if matching_ciks.empty:
    raise ValueError(f"No submission found for company name {name!r}")
cik = int(matching_ciks.iloc[0])

Aggregate Apple's filings into a single DataFrame

In [ ]:

# Gather Apple's 10-Q / 10-K rows from every quarterly submissions table.
# The per-quarter slices are collected in a list and concatenated once;
# concatenating inside the loop would re-copy the accumulated frame each pass.
sub_frames = []
for sub_path in DATA_PATH.glob("**/sub.parquet"):
    quarter_sub = pd.read_parquet(sub_path)
    is_apple = quarter_sub.cik.astype(int) == cik
    is_periodic_report = quarter_sub.form.isin(["10-Q", "10-K"])
    sub_frames.append(quarter_sub[is_apple & is_periodic_report])
# pd.concat raises on an empty list; fall back to an empty frame in that case.
aapl_subs = pd.concat(sub_frames) if sub_frames else pd.DataFrame()

Extract numerical data from the filings and convert to parquet format

In [ ]:

# Collect the numeric facts that belong to Apple's filings (matched on the
# `adsh` column) from every quarter, then persist the combined table.
num_frames = []
for num_path in DATA_PATH.glob("**/num.parquet"):
    quarter_num = pd.read_parquet(num_path).drop("dimh", axis=1)
    num_frames.append(quarter_num[quarter_num.adsh.isin(aapl_subs.adsh)])
if not num_frames:
    # Without any input there is no `ddate` column to parse below; fail with
    # a message that points at the real cause.
    raise FileNotFoundError(f"No num.parquet files found under {DATA_PATH}")
# Single concat instead of growing a DataFrame inside the loop.
aapl_nums = pd.concat(num_frames)
# Bracket assignment always targets the column, never an instance attribute.
aapl_nums["ddate"] = pd.to_datetime(aapl_nums["ddate"], format="%Y%m%d")
aapl_nums.to_parquet(DATA_PATH / "aapl_nums.parquet")

Filter quarterly diluted EPS data, keeping the most recent reporting period from each filing

In [ ]:

# Keep only single-quarter diluted EPS facts.
is_quarterly_eps = (aapl_nums.tag == "EarningsPerShareDiluted") & (
    aapl_nums.qtrs == 1
)
quarterly_eps = aapl_nums[is_quarterly_eps].drop("tag", axis=1)
# Within each filing (adsh) keep the row with the latest ddate.
latest_per_filing = quarterly_eps.groupby("adsh").apply(
    lambda filing: filing.nlargest(n=1, columns=["ddate"]), include_groups=False
)
# Reduce to a date-indexed series of EPS values in chronological order.
eps = latest_per_filing[["ddate", "value"]].set_index("ddate").squeeze().sort_index()
ax = eps.plot.bar()
# Label the bars by calendar quarter instead of full timestamps.
ax.set_xticklabels(eps.index.to_period("Q"))

Retrieve historical stock price data and calculate P/E ratio

In [ ]:

# Fetch AAPL prices from the end of 2014 up to the latest EPS date.
price_history = obb.equity.price.historical(
    "AAPL", start_date="2014-12-31", end_date=eps.index.max(), provider="yfinance"
)
# Resample to one row per calendar day, then keep the 2014-2015 rows.
daily_prices = price_history.to_df().resample("D").last()
aapl = daily_prices.loc["2014":"2015"]

In [ ]:

# Align the daily closing price with the EPS series on the date index and
# forward-fill, so each day carries the most recent EPS value at or before it.
pe = aapl.close.to_frame("price").join(eps.to_frame("eps")).ffill().dropna()
# `eps` holds single-quarter values (qtrs == 1), so this is price divided by
# the latest quarterly EPS - not a trailing-twelve-month P/E.
pe["pe_ratio"] = pe.price.div(pe.eps)
ax = pe.plot(subplots=True, figsize=(16, 8), legend=False, lw=0.5)
# NOTE(review): the plotted series is the provider's `close` column; confirm
# whether it is actually an adjusted close before relying on this title.
ax[0].set_title("Adj Close")
ax[1].set_title("Diluted EPS")
# Title names what is computed; the previous "Trailing P/E" label implied a
# trailing-twelve-month EPS denominator that this cell does not build.
ax[2].set_title("P/E (price / quarterly diluted EPS)")

Define fields of interest for further financial analysis

In [ ]:

# Names of the financial-statement items selected for further analysis.
# "EarningsPerShareDiluted" is matched against the `tag` column of the numeric
# data in an earlier cell; presumably the other entries are `tag` values as
# well - TODO confirm.
fields = [
    "EarningsPerShareDiluted",
    "PaymentsOfDividendsCommonStock",
    "WeightedAverageNumberOfDilutedSharesOutstanding",
    "OperatingIncomeLoss",
    "NetIncomeLoss",
    "GrossProfit",
]

PyQuant News is where finance practitioners level up with Python for quant finance, algorithmic trading, and market data analysis. Looking to get started? Check out the fastest growing, top-selling course to get started with Python for quant finance. For educational purposes. Not investment advice. Use at your own risk.

9.0 KiB Raw Blame History

9.0 KiB

Raw Blame History