File size is getting progressively larger as I continue to download data

Question:

I am downloading financial data and saving it with to_hdf, and I have noticed that each file gets larger and larger as the downloads continue. What is happening?

The first file was saved as 223 KB, and the most recent one before I stopped (number 67) was saved as 14,609 KB.

The following is the code (irrelevant sections have been removed):

import pandas as pd
import datetime as dt
import yfinance as yf
from pandas.tseries.holiday import USFederalHolidayCalendar
import yahoo_fin.stock_info as si
from pathlib import Path
import os.path


def main():
    end = dt.datetime.now()

    start = end + dt.timedelta(days=-5)

    dr = pd.date_range(start=start, end=end)
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    a = dr[~dr.isin(holidays)] # not US holiday
    b = a[a.weekday != 5]
    b = b[b.weekday != 6]

    for year in set(b.year):
        tmp = b[b.year == year]
        for week in set(pd.Index(tmp.isocalendar().week)):
            temp = tmp[pd.Index(tmp.isocalendar().week) == week]
            start = temp[temp.weekday == temp.weekday.min()][0]  # beginning of week
            end = temp[temp.weekday == temp.weekday.max()][0]  # ending of week

    # get list of all index tickers
    ticker_strings = si.tickers_sp500()

    data_dir = 'data'

    x = 1

    tickers_dir = './tickers'

    Index = '^GSPC'

    # initialize list for the following f(x)
    Df_list = list()

    ticker_data(ticker_strings, start, end, Df_list, data_dir, x)

    print("Complete")


def ticker_data(ticker_strings, start, end, Df_list, data_dir, x):
    # find values for individual stocks
    for ticker in ticker_strings:
        loc_start = start
        while loc_start <= end:
            period_end = loc_start + dt.timedelta(days=1)
            intra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m")
            extra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m", prepost=True)
            Df_list.append(intra_day_data)
            Df_list.append(extra_day_data)
            loc_start = loc_start + dt.timedelta(days=1)
        df = pd.concat(Df_list)

        # creates file name
        filename = end.strftime('%F') + " " + ticker + ".h5"
        # saves file name to folder
        df.to_hdf(os.path.join(data_dir, filename), mode='w', key='df')
        #df.to_csv(os.path.join(data_dir, filename))

        print(x, ticker)
        x += 1



if __name__ == "__main__":
    main()

Asked By: La Myass


Answers:

This could be because it is constantly downloading new data.

Answered By: Szymoon_13

You are appending new data to Df_list at every iteration of for ticker in ticker_strings, and you are saving the whole list every time. This means each file also contains all of the previous files' data.
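
A tiny standalone illustration of that accumulation (hypothetical ticker names and frames, just to show the growth pattern):

import pandas as pd

df_list = []
for ticker in ["AAA", "BBB", "CCC"]:                     # stand-ins for real tickers
    df_list.append(pd.DataFrame({"close": range(3)}))    # one ticker's data
    combined = pd.concat(df_list)                         # the list is never cleared
    print(ticker, len(combined))                          # prints 3, 6, 9 -> each "file" keeps growing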

You should use a list that is local to the for ticker in ticker_strings loop instead of one that is passed in as a parameter and never cleared.
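
For illustration, a minimal sketch of that fix applied to the original ticker_data (same yfinance calls; as a side change I drop the redundant period argument and write '%F' out as '%Y-%m-%d' for portability, neither of which is required for the fix):

import datetime as dt
import os.path
import pandas as pd
import yfinance as yf

def ticker_data(ticker_strings, start, end, data_dir):
    # Build each ticker's file from a list that is local to this loop,
    # so earlier tickers' data is not carried over into later files.
    for x, ticker in enumerate(ticker_strings, start=1):
        df_list = []                      # reset for every ticker
        loc_start = start
        while loc_start <= end:
            period_end = loc_start + dt.timedelta(days=1)
            # one day of 1-minute bars, regular hours and pre/post-market
            intra_day_data = yf.download(ticker, loc_start, period_end, interval="1m")
            extra_day_data = yf.download(ticker, loc_start, period_end, interval="1m", prepost=True)
            df_list.append(intra_day_data)
            df_list.append(extra_day_data)
            loc_start += dt.timedelta(days=1)

        df = pd.concat(df_list)           # only this ticker's rows
        filename = end.strftime('%Y-%m-%d') + " " + ticker + ".h5"
        df.to_hdf(os.path.join(data_dir, filename), mode='w', key='df')
        print(x, ticker)

With the list created inside the loop, each saved HDF5 file holds only that ticker's data, so the file sizes stop growing from one ticker to the next.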

Answered By: pqnet