How to scrape an entire web page's data without physically scrolling?
Question:
I am using the following code to extract information of this webpage, but it only fetches first 18 rows of information. How can I ensure that I am loading 2063 rows of information.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time

# Open the shared Airtable view in Chrome and capture the rendered HTML.
driver = webdriver.Chrome()
url = "https://airtable.com/shrqYt5kSqMzHV9R5/tbl8c8kanuNB6bPYr?backgroundColor=green&viewControls=on"
driver.maximize_window()
driver.get(url)
time.sleep(5)  # crude fixed wait for the initial rows to render
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
table1 = soup.find("div", {"id": "table"})
driver.quit()

# Pull the text of every cell div, skipping values that look like URLs.
company_names = [
    cell.text
    for cell in table1.find_all('div', class_="line-height-4 overflow-hidden truncate")
    if not cell.text.startswith('http')
]
print(company_names)
print(len(company_names))
I have tried the following 4 snippets of code for scrolling through the webpage, but none of them work (I am not sure if this step is even needed):
Method 1:
ScrollNumber = 50
# Repeatedly jump to a fixed vertical offset, pausing between jumps.
for _ in range(1, ScrollNumber):
    driver.execute_script("window.scrollTo(1,5000)")  # scroll to said coordinates
    time.sleep(2)
# Then repeatedly jump back toward the top (coordinates swapped).
for _ in range(1, ScrollNumber):
    driver.execute_script("window.scrollTo(5000,1)")  # scroll to said coordinates
    time.sleep(2)
driver.close()
Method 2:
SCROLL_PAUSE_TIME = 0.5

# Keep scrolling to the bottom until the document height stops growing,
# i.e. no further content is being lazily loaded.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Give newly loaded content time to render before re-measuring.
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # height unchanged: nothing new was loaded
    last_height = new_height
Method 3:
from selenium.webdriver.common.by import By

# Send PAGE_DOWN to <body> a fixed number of times, pausing between presses.
# NOTE: find_element_by_tag_name was removed in Selenium 4.3 — use the
# find_element(By.TAG_NAME, ...) form instead.
elem = driver.find_element(By.TAG_NAME, "body")
no_of_pagedowns = 38
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)
    no_of_pagedowns -= 1
Method 4:
from selenium.webdriver.common.by import By

# Jump straight to the end of the page with the END key.
# NOTE: find_element_by_tag_name was removed in Selenium 4.3 — use the
# find_element(By.TAG_NAME, ...) form instead.
elem = driver.find_element(By.TAG_NAME, 'body')
elem.send_keys(Keys.END)
Answers:
It might not be exactly what you expect, but have a look.
You can use the scrapy shell command to scrape the entire data in the webpage, using Scrapy.
import requests  # was missing: the snippet calls requests.get() without importing it
import scrapy
from scrapy.http import HtmlResponse

# Fetch the page with requests and wrap it in a Scrapy response object.
url = "http://example.com"
# encoding= is required when HtmlResponse is given a str body; without it
# Scrapy raises TypeError for unicode bodies.
response = HtmlResponse(url=url, body=requests.get(url).text, encoding='utf-8')
print(response.css('html').get())
Working code using selenium-wire:
from seleniumwire import webdriver  # selenium-wire records the browser's network traffic
from seleniumwire.utils import decode as sw_decode

# Start Chrome and load the shared Airtable view.
driver = webdriver.Chrome()
driver.get('https://airtable.com/shrqYt5kSqMzHV9R5/tbl8c8kanuNB6bPYr?backgroundColor=green&viewControls=on')

# Walk the captured network requests looking for the view-data API call.
for request in driver.requests:
    if not request.response:
        continue
    if not request.url.startswith('https://airtable.com/v0.3/view/viwA14Z1pM69YIsaW/readSharedViewData'):
        continue
    # Decompress the response body (per its Content-Encoding) and decode to text.
    data = sw_decode(request.response.body,
                     request.response.headers.get('Content-Encoding', 'identity'))
    data = data.decode("utf8")
    # Persist the raw JSON payload.
    with open('data.json', 'w') as file:
        file.write(data)
Old answer:
I am not sure if this will help, but if doing it manually also works for you, you can use developer tools for it.
- Press F12 to open developer tools.
- Go to Network tab and refresh the page.
- Find the request which has a large size, starting with
readSharedViewData
- Click on the request
- Go to preview tab
- Right click on the top level and choose
Copy value
- Open a new text file and paste there, and save as JSON
You can now work with the JSON data using python or any other tools of your choosing.
Note: Make sure scraping the data will not create any legal issues for you first.
I am using the following code to extract information of this webpage, but it only fetches first 18 rows of information. How can I ensure that I am loading 2063 rows of information.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time

# Open the shared Airtable view in Chrome and capture the rendered HTML.
driver = webdriver.Chrome()
url = "https://airtable.com/shrqYt5kSqMzHV9R5/tbl8c8kanuNB6bPYr?backgroundColor=green&viewControls=on"
driver.maximize_window()
driver.get(url)
time.sleep(5)  # crude fixed wait for the initial rows to render
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
table1 = soup.find("div", {"id": "table"})
driver.quit()

# Pull the text of every cell div, skipping values that look like URLs.
company_names = [
    cell.text
    for cell in table1.find_all('div', class_="line-height-4 overflow-hidden truncate")
    if not cell.text.startswith('http')
]
print(company_names)
print(len(company_names))
I have tried the following 4 snippets of code for scrolling through the webpage, but none of them work (I am not sure if this step is even needed):
Method 1:
ScrollNumber = 50
# Repeatedly jump to a fixed vertical offset, pausing between jumps.
for _ in range(1, ScrollNumber):
    driver.execute_script("window.scrollTo(1,5000)")  # scroll to said coordinates
    time.sleep(2)
# Then repeatedly jump back toward the top (coordinates swapped).
for _ in range(1, ScrollNumber):
    driver.execute_script("window.scrollTo(5000,1)")  # scroll to said coordinates
    time.sleep(2)
driver.close()
Method 2:
SCROLL_PAUSE_TIME = 0.5

# Keep scrolling to the bottom until the document height stops growing,
# i.e. no further content is being lazily loaded.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Give newly loaded content time to render before re-measuring.
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # height unchanged: nothing new was loaded
    last_height = new_height
Method 3:
from selenium.webdriver.common.by import By

# Send PAGE_DOWN to <body> a fixed number of times, pausing between presses.
# NOTE: find_element_by_tag_name was removed in Selenium 4.3 — use the
# find_element(By.TAG_NAME, ...) form instead.
elem = driver.find_element(By.TAG_NAME, "body")
no_of_pagedowns = 38
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)
    no_of_pagedowns -= 1
Method 4:
from selenium.webdriver.common.by import By

# Jump straight to the end of the page with the END key.
# NOTE: find_element_by_tag_name was removed in Selenium 4.3 — use the
# find_element(By.TAG_NAME, ...) form instead.
elem = driver.find_element(By.TAG_NAME, 'body')
elem.send_keys(Keys.END)
It might not be exactly what you expect, but have a look.
You can use the scrapy shell command to scrape the entire data in the webpage, using Scrapy.
import requests  # was missing: the snippet calls requests.get() without importing it
import scrapy
from scrapy.http import HtmlResponse

# Fetch the page with requests and wrap it in a Scrapy response object.
url = "http://example.com"
# encoding= is required when HtmlResponse is given a str body; without it
# Scrapy raises TypeError for unicode bodies.
response = HtmlResponse(url=url, body=requests.get(url).text, encoding='utf-8')
print(response.css('html').get())
Working code using selenium-wire:
from seleniumwire import webdriver  # selenium-wire records the browser's network traffic
from seleniumwire.utils import decode as sw_decode

# Start Chrome and load the shared Airtable view.
driver = webdriver.Chrome()
driver.get('https://airtable.com/shrqYt5kSqMzHV9R5/tbl8c8kanuNB6bPYr?backgroundColor=green&viewControls=on')

# Walk the captured network requests looking for the view-data API call.
for request in driver.requests:
    if not request.response:
        continue
    if not request.url.startswith('https://airtable.com/v0.3/view/viwA14Z1pM69YIsaW/readSharedViewData'):
        continue
    # Decompress the response body (per its Content-Encoding) and decode to text.
    data = sw_decode(request.response.body,
                     request.response.headers.get('Content-Encoding', 'identity'))
    data = data.decode("utf8")
    # Persist the raw JSON payload.
    with open('data.json', 'w') as file:
        file.write(data)
Old answer:
I am not sure if this will help, but if doing it manually also works for you, you can use developer tools for it.
- Press F12 to open developer tools.
- Go to Network tab and refresh the page.
- Find the request which has a large size, starting with
readSharedViewData
- Click on the request
- Go to preview tab
- Right click on the top level and choose
Copy value
- Open a new text file and paste there, and save as JSON
You can now work with the JSON data using python or any other tools of your choosing.
Note: Make sure scraping the data will not create any legal issues for you first.