File size is getting progressively larger as I continue to download data
Question:
I am downloading financial data using to_hdf and I have noticed that each file gets larger and larger as it keeps downloading. What is happening?
The first file was saved as 223 KB and the most recent where I stopped (67) was saved as 14,609 KB.
The following is the code (some sections that are irrelevant have been removed):
import pandas as pd
import datetime as dt
import yfinance as yf
from pandas.tseries.holiday import USFederalHolidayCalendar
import yahoo_fin.stock_info as si
from pathlib import Path
import os.path
def main():
end = dt.datetime.now()
start = end + dt.timedelta(days=-5)
dr = pd.date_range(start=start, end=end)
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())
a = dr[~dr.isin(holidays)] # not US holiday
b = a[a.weekday != 5]
b = b[b.weekday != 6]
for year in set(b.year):
tmp = b[b.year == year]
for week in set(pd.Index(tmp.isocalendar().week)):
temp = tmp[pd.Index(tmp.isocalendar().week) == week]
start = temp[temp.weekday == temp.weekday.min()][0] # beginning of week
end = temp[temp.weekday == temp.weekday.max()][0] # ending of week
# get list of all index tickers
ticker_strings = si.tickers_sp500()
data_dir = 'data'
x = 1
tickers_dir = './tickers'
Index = '^GSPC'
# initialize list for the following f(x)
Df_list = list()
ticker_data(ticker_strings, start, end, Df_list, data_dir, x)
print("Complete")
def ticker_data(ticker_strings, start, end, Df_list, data_dir, x):
# find values for individual stocks
for ticker in ticker_strings:
loc_start = start
while loc_start <= end:
period_end = loc_start + dt.timedelta(days=1)
intra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m")
extra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m", prepost=True)
Df_list.append(intra_day_data)
Df_list.append(extra_day_data)
loc_start = loc_start + dt.timedelta(days=1)
df = pd.concat(Df_list)
# creates file name
filename = end.strftime('%F') + " " + ticker + ".h5"
# saves file name to folder
df.to_hdf(os.path.join(data_dir, filename), mode='w', key='df')
#df.to_csv(os.path.join(data_dir, filename))
print(x, ticker)
x += 1
if __name__ == "__main__":
main()
Answers:
This could be because it is constantly downloading new data.
You are appending new data to Df_list
at every iteration of for ticker in ticker_strings
and you are saving all of that every time. Which means that every file will contain also the previous file’s data.
You should use a variable local to the for ticker in ticker_strings
loop instead of using a list passed in as a parameter.
I am downloading financial data using to_hdf and I have noticed that each file gets larger and larger as it keeps downloading. What is happening?
The first file was saved as 223 KB and the most recent where I stopped (67) was saved as 14,609 KB.
The following is the code (some sections that are irrelevant have been removed):
import pandas as pd
import datetime as dt
import yfinance as yf
from pandas.tseries.holiday import USFederalHolidayCalendar
import yahoo_fin.stock_info as si
from pathlib import Path
import os.path
def main():
end = dt.datetime.now()
start = end + dt.timedelta(days=-5)
dr = pd.date_range(start=start, end=end)
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())
a = dr[~dr.isin(holidays)] # not US holiday
b = a[a.weekday != 5]
b = b[b.weekday != 6]
for year in set(b.year):
tmp = b[b.year == year]
for week in set(pd.Index(tmp.isocalendar().week)):
temp = tmp[pd.Index(tmp.isocalendar().week) == week]
start = temp[temp.weekday == temp.weekday.min()][0] # beginning of week
end = temp[temp.weekday == temp.weekday.max()][0] # ending of week
# get list of all index tickers
ticker_strings = si.tickers_sp500()
data_dir = 'data'
x = 1
tickers_dir = './tickers'
Index = '^GSPC'
# initialize list for the following f(x)
Df_list = list()
ticker_data(ticker_strings, start, end, Df_list, data_dir, x)
print("Complete")
def ticker_data(ticker_strings, start, end, Df_list, data_dir, x):
# find values for individual stocks
for ticker in ticker_strings:
loc_start = start
while loc_start <= end:
period_end = loc_start + dt.timedelta(days=1)
intra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m")
extra_day_data = yf.download(ticker, loc_start, period_end, period="1d", interval="1m", prepost=True)
Df_list.append(intra_day_data)
Df_list.append(extra_day_data)
loc_start = loc_start + dt.timedelta(days=1)
df = pd.concat(Df_list)
# creates file name
filename = end.strftime('%F') + " " + ticker + ".h5"
# saves file name to folder
df.to_hdf(os.path.join(data_dir, filename), mode='w', key='df')
#df.to_csv(os.path.join(data_dir, filename))
print(x, ticker)
x += 1
if __name__ == "__main__":
main()
This could be because it is constantly downloading new data.
You are appending new data to Df_list
at every iteration of for ticker in ticker_strings
and you are saving all of that every time. Which means that every file will contain also the previous file’s data.
You should use a variable local to the for ticker in ticker_strings
loop instead of using a list passed in as a parameter.