Python Pandas read_html multi_index table?
Question:
I am not sure whether this should be called a multi-index. Here is the page I am trying to get data from:
Azure product availability by region.
The table rows form a three-level hierarchy by class: "category-row" –> "service-row" –> "capability-row".
pandas.read_html gives me a flat table containing the values from all three classes. Is there a way to keep the hierarchy in the data?
Here is the code
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# Start Firefox without a visible window.
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
# Element lookups keep retrying for up to 30 seconds before failing.
driver.implicitly_wait(30)

url = 'https://azure.microsoft.com/en-us/explore/global-infrastructure/products-by-region/?regions=us-east-2,canada-central,canada-east&products=all'
driver.get(url)

# Fetch the table markup once and reuse it for both BeautifulSoup and pandas.
# find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
# is the supported replacement.
table_html = driver.find_element(By.ID, "primary-table").get_attribute('outerHTML')

tree = BeautifulSoup(table_html, "html5lib")
table = tree.find('table', class_='primary-table')
# The <th> cells of the region header row determine how many columns to keep.
header_list = table.find('tr', {'class': 'region-headers-row'}).find_all('th')

# read_html yields a flat table; trim it to the number of header cells.
# The markup is wrapped in StringIO because recent pandas versions deprecate
# passing literal HTML as a plain string.
df = pd.read_html(StringIO(table_html), header=0)[0].iloc[:, :len(header_list)]

# NOTE: `driver` is left open here; call driver.quit() once the page is no
# longer needed.
Answers:
I am not sure whether this fits your needs, but it also extracts the table contents and may provide the expected result.
Example
...
data = []
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's parser warning).
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Build one output row per service row that contains region cells.
for r in soup.select('table tr.service-row:has([data-region-slug])'):
    row = [
        # The category label is taken from the closest preceding category row.
        r.find_previous('tr', attrs={'class': 'category-row'}).th.get_text(strip=True),
        r.th.get_text(strip=True),
    ]
    for c in r.select('td'):
        # Cells holding an image are recorded by image URL, others by span text.
        if c.img:
            row.append(c.img.get('src'))
        else:
            row.append(c.span.text)
    data.append(row)

# Column labels: 'Category' followed by every text string in the first table.
df = pd.DataFrame(data, columns=['Category'] + list(soup.table.stripped_strings))
# Add a top header level from each header cell's data-colgroup attribute;
# the first two columns get an empty top-level label.
df.columns = pd.MultiIndex.from_tuples(
    list(
        zip(
            ['', ''] + [c.get('data-colgroup') for c in soup.table.select('th[data-colgroup]')],
            df.columns,
        )
    )
)
df

# Translate image URLs into short status labels.
# NOTE(review): the keys embed a long "cvt-..." path segment that looks
# version-specific -- confirm they still match the live page before relying on them.
mapper = {
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/ga.svg': 'hook',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/planned-active.svg': 'planned-active',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/preview-active.svg': 'preview-active',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/preview.svg': 'preview',
}
# replace() returns a new frame, so assign it back to keep the mapped values.
df = df.replace(mapper)
df
Output
|  |  |  | Canada | Canada | United States |
|---|---|---|---|---|---|
|  | Category | Products | Canada Central | Canada East | East US 2 |
| 0 | AI + machine learning | Azure Databricks | hook | hook | hook |
| 1 | AI + machine learning | Azure Bot Services | Not available | Not available | Not available |
| 2 | AI + machine learning | Azure Cognitive Search | hook | hook | hook |
| 3 | AI + machine learning | Microsoft Genomics | Not available | Not available | hook |
| 4 | AI + machine learning | Azure Machine Learning | hook | hook | hook |
| 9613 | Web | Azure Web PubSub | hook | hook | hook |
| 9614 | Web | Azure Fluid Relay | planned-active | Not available | hook |
| 9615 | Virtual desktop infrastructure | Azure Virtual Desktop | Not available | Not available | Not available |
| 9616 | Virtual desktop infrastructure | Azure Lab Services | hook | hook | hook |
| 9617 | Virtual desktop infrastructure | Microsoft Dev Box | preview | Not available | preview |
I am not sure whether this should be called a multi-index. Here is the page I am trying to get data from:
Azure product availability by region.
The table rows form a three-level hierarchy by class: "category-row" –> "service-row" –> "capability-row".
pandas.read_html gives me a flat table containing the values from all three classes. Is there a way to keep the hierarchy in the data?
Here is the code
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# Start Firefox without a visible window.
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
# Element lookups keep retrying for up to 30 seconds before failing.
driver.implicitly_wait(30)

url = 'https://azure.microsoft.com/en-us/explore/global-infrastructure/products-by-region/?regions=us-east-2,canada-central,canada-east&products=all'
driver.get(url)

# Fetch the table markup once and reuse it for both BeautifulSoup and pandas.
# find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
# is the supported replacement.
table_html = driver.find_element(By.ID, "primary-table").get_attribute('outerHTML')

tree = BeautifulSoup(table_html, "html5lib")
table = tree.find('table', class_='primary-table')
# The <th> cells of the region header row determine how many columns to keep.
header_list = table.find('tr', {'class': 'region-headers-row'}).find_all('th')

# read_html yields a flat table; trim it to the number of header cells.
# The markup is wrapped in StringIO because recent pandas versions deprecate
# passing literal HTML as a plain string.
df = pd.read_html(StringIO(table_html), header=0)[0].iloc[:, :len(header_list)]

# NOTE: `driver` is left open here; call driver.quit() once the page is no
# longer needed.
I am not sure whether this fits your needs, but it also extracts the table contents and may provide the expected result.
Example
...
data = []
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's parser warning).
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Build one output row per service row that contains region cells.
for r in soup.select('table tr.service-row:has([data-region-slug])'):
    row = [
        # The category label is taken from the closest preceding category row.
        r.find_previous('tr', attrs={'class': 'category-row'}).th.get_text(strip=True),
        r.th.get_text(strip=True),
    ]
    for c in r.select('td'):
        # Cells holding an image are recorded by image URL, others by span text.
        if c.img:
            row.append(c.img.get('src'))
        else:
            row.append(c.span.text)
    data.append(row)

# Column labels: 'Category' followed by every text string in the first table.
df = pd.DataFrame(data, columns=['Category'] + list(soup.table.stripped_strings))
# Add a top header level from each header cell's data-colgroup attribute;
# the first two columns get an empty top-level label.
df.columns = pd.MultiIndex.from_tuples(
    list(
        zip(
            ['', ''] + [c.get('data-colgroup') for c in soup.table.select('th[data-colgroup]')],
            df.columns,
        )
    )
)
df

# Translate image URLs into short status labels.
# NOTE(review): the keys embed a long "cvt-..." path segment that looks
# version-specific -- confirm they still match the live page before relying on them.
mapper = {
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/ga.svg': 'hook',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/planned-active.svg': 'planned-active',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/preview-active.svg': 'preview-active',
    '//azurecomcdn.azureedge.net/cvt-5983f2707de6e50e5020c6059b619845bc5be5434c362ed8e18652d58e15571e/images/page/explore/global-infrastructure/products-by-region/preview.svg': 'preview',
}
# replace() returns a new frame, so assign it back to keep the mapped values.
df = df.replace(mapper)
df
Output
|  |  |  | Canada | Canada | United States |
|---|---|---|---|---|---|
|  | Category | Products | Canada Central | Canada East | East US 2 |
0 | AI + machine learning | Azure Databricks | hook | hook | hook |
1 | AI + machine learning | Azure Bot Services | Not available | Not available | Not available |
2 | AI + machine learning | Azure Cognitive Search | hook | hook | hook |
3 | AI + machine learning | Microsoft Genomics | Not available | Not available | hook |
4 | AI + machine learning | Azure Machine Learning | hook | hook | hook |
9613 | Web | Azure Web PubSub | hook | hook | hook |
9614 | Web | Azure Fluid Relay | planned-active | Not available | hook |
9615 | Virtual desktop infrastructure | Azure Virtual Desktop | Not available | Not available | Not available |
9616 | Virtual desktop infrastructure | Azure Lab Services | hook | hook | hook |
9617 | Virtual desktop infrastructure | Microsoft Dev Box | preview | Not available | preview |