Python Limit time to run pandas read_html
Question:
I am trying to limit the running time of dfs = pd.read_html(str(response.text)). If it runs for more than 5 seconds, it should stop processing that URL and move on to the next one. I could not find a timeout parameter in pd.read_html, so how can I do that?
from bs4 import BeautifulSoup
import re
import requests
import os
import time
from pandas import DataFrame
import pandas as pd
from urllib.request import urlopen
from io import StringIO

# Download each SEC filing, parse every HTML table in it with pandas, and keep
# the table whose first column mentions revenue/profit (or, failing that,
# income/loss) in ``Operation_sheet``.

# NOTE(review): '[email protected]' looks like a placeholder left by e-mail
# obfuscation on the page this snippet was copied from -- presumably a real
# contact address belongs in the User-Agent; confirm before running.
headers = {'User-Agent': '[email protected]'}
# A set, so the iteration order over the URLs is not guaranteed.
urls = {
    'https://www.sec.gov/Archives/edgar/data/1058307/0001493152-21-003451.txt',
    'https://www.sec.gov/Archives/edgar/data/1064722/0001760319-21-000006.txt',
}

for url in urls:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    time.sleep(0.1)  # short pause after each request
    # response.text is already a str; wrapping it in StringIO hands pandas a
    # file-like object instead of a (deprecated) literal HTML string.
    dfs = pd.read_html(StringIO(response.text))
    print(url)
    for item in dfs:
        try:
            first_column = item[0].apply(str)
        except KeyError:
            # This table has no column labelled 0, so there is nothing to
            # search in it; move on to the next table.
            continue

        Operation = (first_column.str.contains('Revenue')
                     | first_column.str.contains('profit'))
        if Operation.any():
            Operation_sheet = item
        else:
            # Fall back to the income/loss keywords only when the
            # revenue/profit search found nothing (this is also the path
            # taken for an empty table).
            CashFlows = (first_column.str.contains('income')
                         | first_column.str.contains('loss'))
            if CashFlows.any():
                Operation_sheet = item
Answers:
I’m not certain what the issue is, but pandas seems to get overwhelmed by this file. If we instead use BeautifulSoup to search for the tables, prettify them, and pass each one to pd.read_html(), then it seems to be able to handle things just fine.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from io import StringIO

# Fetch one SEC filing, let BeautifulSoup locate each <table>, and hand the
# tables to pandas one at a time instead of passing the whole document to
# pd.read_html() in a single call.

# NOTE(review): '[email protected]' looks like a placeholder left by e-mail
# obfuscation on the page this snippet was copied from -- presumably a real
# contact address belongs in the User-Agent; confirm before running.
headers = {'User-Agent': '[email protected]'}
url = 'https://www.sec.gov/Archives/edgar/data/1064722/0001760319-21-000006.txt'

r = requests.get(url, headers=headers)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and bs4 does not have to guess).
soup = BeautifulSoup(r.text, 'html.parser')

dfs = []
for table in soup.find_all('table'):
    # StringIO gives pandas a file-like object rather than a (deprecated)
    # literal HTML string.
    dfs.extend(pd.read_html(StringIO(table.prettify())))

# Printing the first few:
for df in dfs[0:3]:
    print(df, '\n')
0 1 2 3 4
0 Nevada NaN 4813 NaN 65-0783722
1 (State or other jurisdiction of NaN (Primary Standard Industrial NaN (I.R.S. Employer
2 incorporation or organization) NaN Classification Code Number) NaN Identification Number)
0
0 Ralph V. De Martino, Esq.
1 Alec Orudjev, Esq.
2 Schiff Hardin LLP
3 901 K Street, NW, Suite 700
4 Washington, DC 20001
5 Phone (202) 778-6400
6 Fax: (202) 778-6460
0 1
0 Large accelerated filer [ ] Accelerated filer [ ]
1 NaN NaN
2 Non-accelerated filer [X] Smaller reporting company [X]
3 NaN NaN
4 NaN Emerging growth company [ ]
I am trying to limit the running time of dfs = pd.read_html(str(response.text)). If it runs for more than 5 seconds, it should stop processing that URL and move on to the next one. I could not find a timeout parameter in pd.read_html, so how can I do that?
from bs4 import BeautifulSoup
import re
import requests
import os
import time
from pandas import DataFrame
import pandas as pd
from urllib.request import urlopen
from io import StringIO

# Download each SEC filing, parse every HTML table in it with pandas, and keep
# the table whose first column mentions revenue/profit (or, failing that,
# income/loss) in ``Operation_sheet``.

# NOTE(review): '[email protected]' looks like a placeholder left by e-mail
# obfuscation on the page this snippet was copied from -- presumably a real
# contact address belongs in the User-Agent; confirm before running.
headers = {'User-Agent': '[email protected]'}
# A set, so the iteration order over the URLs is not guaranteed.
urls = {
    'https://www.sec.gov/Archives/edgar/data/1058307/0001493152-21-003451.txt',
    'https://www.sec.gov/Archives/edgar/data/1064722/0001760319-21-000006.txt',
}

for url in urls:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    time.sleep(0.1)  # short pause after each request
    # response.text is already a str; wrapping it in StringIO hands pandas a
    # file-like object instead of a (deprecated) literal HTML string.
    dfs = pd.read_html(StringIO(response.text))
    print(url)
    for item in dfs:
        try:
            first_column = item[0].apply(str)
        except KeyError:
            # This table has no column labelled 0, so there is nothing to
            # search in it; move on to the next table.
            continue

        Operation = (first_column.str.contains('Revenue')
                     | first_column.str.contains('profit'))
        if Operation.any():
            Operation_sheet = item
        else:
            # Fall back to the income/loss keywords only when the
            # revenue/profit search found nothing (this is also the path
            # taken for an empty table).
            CashFlows = (first_column.str.contains('income')
                         | first_column.str.contains('loss'))
            if CashFlows.any():
                Operation_sheet = item
I’m not certain what the issue is, but pandas seems to get overwhelmed by this file. If we instead use BeautifulSoup to search for the tables, prettify them, and pass each one to pd.read_html(), then it seems to be able to handle things just fine.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from io import StringIO

# Fetch one SEC filing, let BeautifulSoup locate each <table>, and hand the
# tables to pandas one at a time instead of passing the whole document to
# pd.read_html() in a single call.

# NOTE(review): '[email protected]' looks like a placeholder left by e-mail
# obfuscation on the page this snippet was copied from -- presumably a real
# contact address belongs in the User-Agent; confirm before running.
headers = {'User-Agent': '[email protected]'}
url = 'https://www.sec.gov/Archives/edgar/data/1064722/0001760319-21-000006.txt'

r = requests.get(url, headers=headers)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and bs4 does not have to guess).
soup = BeautifulSoup(r.text, 'html.parser')

dfs = []
for table in soup.find_all('table'):
    # StringIO gives pandas a file-like object rather than a (deprecated)
    # literal HTML string.
    dfs.extend(pd.read_html(StringIO(table.prettify())))

# Printing the first few:
for df in dfs[0:3]:
    print(df, '\n')
0 1 2 3 4
0 Nevada NaN 4813 NaN 65-0783722
1 (State or other jurisdiction of NaN (Primary Standard Industrial NaN (I.R.S. Employer
2 incorporation or organization) NaN Classification Code Number) NaN Identification Number)
0
0 Ralph V. De Martino, Esq.
1 Alec Orudjev, Esq.
2 Schiff Hardin LLP
3 901 K Street, NW, Suite 700
4 Washington, DC 20001
5 Phone (202) 778-6400
6 Fax: (202) 778-6460
0 1
0 Large accelerated filer [ ] Accelerated filer [ ]
1 NaN NaN
2 Non-accelerated filer [X] Smaller reporting company [X]
3 NaN NaN
4 NaN Emerging growth company [ ]