i think im confusing python and java logic (python while loop error)
Question:
import requests
import csv
import PyPDF2
import regex
import finnhub
import json
from schwab_api import Schwab
russell3000 = "https://research.ftserussell.com/analytics/factsheets/Home/DownloadConstituentsWeights/?indexdetails=US3000"
def gather_index(index):
tickers = []
ciks = []
with open("index.pdf", "wb") as f:
f.write(requests.get(index).content)
pdfReader = PyPDF2.PdfFileReader(open("index.pdf", "rb"))
for x in range(pdfReader.numPages):
pageObj = pdfReader.getPage(x)
for y in regex.findall("(?<=Countryn|United Statesn)(.*?)(?=n[0-9][.][0-9][0-9][0-9])", pageObj.extractText()):
database = requests.get("https://www.sec.gov/files/company_tickers.json").json()
i = 0
while True:
try:
if y==database[i]["title"]:
tickers += database[i]["ticker"]
if(database[i]["cik_str"].length()<10):
cik = database[i]["cik_str"]
for z in range(10-database[i]["cik_str"].length()):
cik = "0"+cik
ciks += cik
else:
ciks += database[i]["cik_str"]
i += 1
except KeyError:
i += 1
continue
return ciks, tickers
if __name__ == "__main__":
print(gather_index(russell3000))
im really not quite sure what goes wrong here
i know that the loop keeps going on and on but my logic seems sound-ish
the goal of this is to save and read the pdf to find a company name then save and search the json line by line for the company and save the ticker and cik to a list
Answers:
You don’t need a while loop at all. You are iterating over a finite amount of data.
You only need to lookup company tickers once. Then you can regex match the PDF and cross-compare
E.g.
# Gather SEC data
company_tickers = requests.get("https://www.sec.gov/files/company_tickers.json").json()
sec_data = dict()
for _, v in company_tickers.items():
sec_data[v['title']] = v # create a reverse index
# Compare against PDF fact sheet
tickers = []
pdfReader = PyPDF2.PdfFileReader(open("index.pdf", "rb"))
for x in range(pdfReader.numPages):
pageObj = pdfReader.getPage(x)
for y in regex.findall("(?<=Countryn|United Statesn)(.*?)(?=n[0-9][.][0-9][0-9][0-9])", pageObj.extractText()):
if y in sec_data: # do a key lookup based on the title
tickers.append(sec_data[y]['ticker'])
print(tickers)
Not sure what cik
represents… but data["cik_str"]
seem to be integers, so do not have a .length
, but seems you want to left-pad with zeros?
>>> aapl = {
... "cik_str": 320193,
... "ticker": "AAPL",
... "title": "Apple Inc."
... }
>>> aapl['cik_str']
320193
>>> str(aapl['cik_str']).zfill(10)
'0000320193'
import requests
import csv
import PyPDF2
import regex
import finnhub
import json
from schwab_api import Schwab
russell3000 = "https://research.ftserussell.com/analytics/factsheets/Home/DownloadConstituentsWeights/?indexdetails=US3000"
def gather_index(index):
tickers = []
ciks = []
with open("index.pdf", "wb") as f:
f.write(requests.get(index).content)
pdfReader = PyPDF2.PdfFileReader(open("index.pdf", "rb"))
for x in range(pdfReader.numPages):
pageObj = pdfReader.getPage(x)
for y in regex.findall("(?<=Countryn|United Statesn)(.*?)(?=n[0-9][.][0-9][0-9][0-9])", pageObj.extractText()):
database = requests.get("https://www.sec.gov/files/company_tickers.json").json()
i = 0
while True:
try:
if y==database[i]["title"]:
tickers += database[i]["ticker"]
if(database[i]["cik_str"].length()<10):
cik = database[i]["cik_str"]
for z in range(10-database[i]["cik_str"].length()):
cik = "0"+cik
ciks += cik
else:
ciks += database[i]["cik_str"]
i += 1
except KeyError:
i += 1
continue
return ciks, tickers
if __name__ == "__main__":
print(gather_index(russell3000))
im really not quite sure what goes wrong here
i know that the loop keeps going on and on but my logic seems sound-ish
the goal of this is to save and read the pdf to find a company name then save and search the json line by line for the company and save the ticker and cik to a list
You don’t need a while loop at all. You are iterating over a finite amount of data.
You only need to lookup company tickers once. Then you can regex match the PDF and cross-compare
E.g.
# Gather SEC data
company_tickers = requests.get("https://www.sec.gov/files/company_tickers.json").json()
sec_data = dict()
for _, v in company_tickers.items():
sec_data[v['title']] = v # create a reverse index
# Compare against PDF fact sheet
tickers = []
pdfReader = PyPDF2.PdfFileReader(open("index.pdf", "rb"))
for x in range(pdfReader.numPages):
pageObj = pdfReader.getPage(x)
for y in regex.findall("(?<=Countryn|United Statesn)(.*?)(?=n[0-9][.][0-9][0-9][0-9])", pageObj.extractText()):
if y in sec_data: # do a key lookup based on the title
tickers.append(sec_data[y]['ticker'])
print(tickers)
Not sure what cik
represents… but data["cik_str"]
seem to be integers, so do not have a .length
, but seems you want to left-pad with zeros?
>>> aapl = {
... "cik_str": 320193,
... "ticker": "AAPL",
... "title": "Apple Inc."
... }
>>> aapl['cik_str']
320193
>>> str(aapl['cik_str']).zfill(10)
'0000320193'