scrape the overview
Question:
I wonder why I cannot scrape this company overview. For example, I want to scrape Walmart’s size, which is 10000+ employees. Below is my code; I am not sure why the info I am looking for is not there…
# Asker's original attempt, reproduced as posted: fetch the Glassdoor overview
# page for Walmart and parse the returned HTML with BeautifulSoup.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = f'https://www.glassdoor.com/Overview/Working-at-Walmart-EI_IE715.11,18.htm'
# f'https://www.glassdoor.com/Reviews/Google-Engineering-Reviews-EI_IE9079.0,6_DEPT1007_IP{pg}.htm?sort.sortType=RD&sort.ascending=false&filter.iso3Language=eng'
# NOTE(review): `headers` is never defined in this snippet, so this line raises
# NameError as written. Also, the second positional parameter of requests.get()
# is `params`, so even a defined dict would be sent as query-string parameters
# rather than HTTP headers; it would need to be passed as headers=headers.
r = requests.get(url, headers)
soup = BeautifulSoup(r.content, 'html.parser')
Answers:
Here is one possible solution:
import re
import json
import requests
from bs4 import BeautifulSoup

# Send a browser-like user agent instead of the python-requests default
# (presumably the site rejects the default one -- confirm if it matters).
headers = {
    'user-agent': 'Mozilla/5.0'
}

with requests.Session() as session:
    session.headers.update(headers)
    response = session.get('https://www.glassdoor.com/Overview/Working-at-Walmart-EI_IE715.htm', timeout=30)
    # Fail here with a clear HTTP error instead of a later IndexError when the
    # expected <script> is missing from an error page.
    response.raise_for_status()
    raw_data = response.text

# The overview fields are read from the JSON embedded in the inline <script>
# whose text mentions window.appCache, not from the rendered HTML elements.
script = [s.text for s in BeautifulSoup(raw_data, "lxml").find_all("script") if "window.appCache" in s.text][0]

# Capture the JSON object stored under the "Employer:<digits>" key, which ends
# right before the "ROOT_QUERY" key. The backslash in \d matters: a bare "d+"
# matches literal "d" characters and never finds the employer entry.
json_data = json.loads(re.findall(r'("Employer:\d+":)(.+)(,"ROOT_QUERY")', script)[0][1])

# Keep only the overview fields of interest.
data = {
    "id": json_data["id"],
    "shortName": json_data["shortName"],
    "website": json_data["website"],
    "type": json_data["type"],
    "revenue": json_data["revenue"],
    "headquarters": json_data["headquarters"],
    "size": json_data["size"],
    "yearFounded": json_data["yearFounded"]
}

print(data)
Output:
{
'id': 715,
'shortName': 'Walmart',
'website': 'careers.walmart.com',
'type': 'Company - Public',
'revenue': '$10+ billion (USD)',
'headquarters': 'Bentonville, AR',
'size': '10000+ Employees',
'yearFounded': 1962
}
If you only need "size" then just use e.g. size = json_data["size"]
I wonder why I cannot scrape this company overview. For example, I want to scrape Walmart’s size, which is 10000+ employees. Below is my code; I am not sure why the info I am looking for is not there…
# Asker's original attempt, reproduced as posted: fetch the Glassdoor overview
# page for Walmart and parse the returned HTML with BeautifulSoup.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = f'https://www.glassdoor.com/Overview/Working-at-Walmart-EI_IE715.11,18.htm'
# f'https://www.glassdoor.com/Reviews/Google-Engineering-Reviews-EI_IE9079.0,6_DEPT1007_IP{pg}.htm?sort.sortType=RD&sort.ascending=false&filter.iso3Language=eng'
# NOTE(review): `headers` is never defined in this snippet, so this line raises
# NameError as written. Also, the second positional parameter of requests.get()
# is `params`, so even a defined dict would be sent as query-string parameters
# rather than HTTP headers; it would need to be passed as headers=headers.
r = requests.get(url, headers)
soup = BeautifulSoup(r.content, 'html.parser')
Here is one possible solution:
import re
import json
import requests
from bs4 import BeautifulSoup

# Send a browser-like user agent instead of the python-requests default
# (presumably the site rejects the default one -- confirm if it matters).
headers = {
    'user-agent': 'Mozilla/5.0'
}

with requests.Session() as session:
    session.headers.update(headers)
    response = session.get('https://www.glassdoor.com/Overview/Working-at-Walmart-EI_IE715.htm', timeout=30)
    # Fail here with a clear HTTP error instead of a later IndexError when the
    # expected <script> is missing from an error page.
    response.raise_for_status()
    raw_data = response.text

# The overview fields are read from the JSON embedded in the inline <script>
# whose text mentions window.appCache, not from the rendered HTML elements.
script = [s.text for s in BeautifulSoup(raw_data, "lxml").find_all("script") if "window.appCache" in s.text][0]

# Capture the JSON object stored under the "Employer:<digits>" key, which ends
# right before the "ROOT_QUERY" key. The backslash in \d matters: a bare "d+"
# matches literal "d" characters and never finds the employer entry.
json_data = json.loads(re.findall(r'("Employer:\d+":)(.+)(,"ROOT_QUERY")', script)[0][1])

# Keep only the overview fields of interest.
data = {
    "id": json_data["id"],
    "shortName": json_data["shortName"],
    "website": json_data["website"],
    "type": json_data["type"],
    "revenue": json_data["revenue"],
    "headquarters": json_data["headquarters"],
    "size": json_data["size"],
    "yearFounded": json_data["yearFounded"]
}

print(data)
Output:
{
'id': 715,
'shortName': 'Walmart',
'website': 'careers.walmart.com',
'type': 'Company - Public',
'revenue': '$10+ billion (USD)',
'headquarters': 'Bentonville, AR',
'size': '10000+ Employees',
'yearFounded': 1962
}
If you only need "size" then just use e.g. size = json_data["size"]