How to use "for" to "loop" multiple urls while webscraping?
Question:
The current code works perfectly to scrape the information for only one URL, and I would like to be able to scrape from multiple URLs at once (maybe using `for url in urls`). Here is the current code for just one URL below. Any help or direction would be appreciated.
import datetime
from bs4 import BeautifulSoup
import requests
def get_url_data_from_url_request(url):
    """Fetch *url* over HTTP and return the response body as text.

    Parameters:
        url (str): Absolute URL of the page to download.

    Returns:
        str: The decoded response body (``response.text``).
    """
    print(">> get_url_data_from_url_request: " + str(url))
    # Fixed: this string literal was broken across multiple lines in the
    # original (a SyntaxError); implicit concatenation keeps lines short.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.93 Safari/537.36"
        )
    }
    # Context manager guarantees the session (and its connections) are
    # closed even if the request raises, replacing the manual close() calls.
    with requests.Session() as s:
        s.keep_alive = False  # discourage connection reuse between requests
        response = s.get(url, proxies=None, headers=headers)
        print("request.status_code: ", response.status_code)
        return response.text
def main():
    """Entry point: download one product page and print its SKU."""
    print("bdr.sandbox")
    generated_on = datetime.datetime.now()
    print(generated_on)
    source_product_url = ""  # TODO: set the product URL to scrape
    url_data = get_url_data_from_url_request(url=source_product_url)
    soup = BeautifulSoup(url_data, "lxml")
    sku_tag = soup.find('span', {"itemprop": "sku"})
    # Guard: find() returns None when the element is absent; fail with a
    # clear message instead of a bare AttributeError on `.text`.
    if sku_tag is None:
        raise ValueError(f"no <span itemprop='sku'> found at {source_product_url}")
    id_element = sku_tag.text
    print(id_element)


if __name__ == '__main__':
    main()
Answers:
There are countless ways to do it; this is one example: parametrize `source_product_url` in `main` and call it in a loop. (Add `return id_element` if you want to save the data for future use.)
def get_sku(source_product_url):
    """Download one product page and return its SKU text.

    Parameters:
        source_product_url (str): URL of the product page to scrape.

    Returns:
        str: Text content of the ``<span itemprop="sku">`` element.
    """
    print("bdr.sandbox")
    print(datetime.datetime.now())
    html = get_url_data_from_url_request(url=source_product_url)
    sku = BeautifulSoup(html, "lxml").find('span', {"itemprop": "sku"}).text
    print(sku)
    return sku
You could just do for url in urlsList: get_sku(url)
at this point, but if you want to collect some data [and also make sure that, even if any errors are raised while scraping some of the pages, the program still attempts to scrape the rest], then you could do something like
def try_get_sku(srcProdUrl):
    """Scrape one URL without ever raising.

    Parameters:
        srcProdUrl (str): Product URL to scrape.

    Returns:
        dict: Always contains 'source_product_url'; on success also 'sku',
        on failure 'errorMsg' with the exception type and message.
    """
    try:
        return {'source_product_url': srcProdUrl, 'sku': get_sku(srcProdUrl)}
    except Exception as e:
        # Swallow per-URL failures deliberately so a loop over many URLs
        # keeps going; the error is recorded in the returned dict.
        errorMsg = f'{type(e)} {e}'
        # Fixed: the original ' n ' was a mangled '\n' escape.
        print(f'failed to scrape {srcProdUrl}\n', errorMsg)
        return {'source_product_url': srcProdUrl, 'errorMsg': errorMsg}
(This also makes it easy to use in a list comprehension.)
def main():
    """Scrape every URL in spUrls, collecting one result dict per URL."""
    spUrls = [
        'https://www.6pm.com/p/easy-spirit-epic-gray/product/9450972/color/11',
        ## ADD ALL URLS TO SCRAPE ##
    ]
    # Bug fix: the original called try_get_sku(spUrls), passing the whole
    # list on every iteration instead of the individual url.
    productsInfo = [try_get_sku(url) for url in spUrls]
    # do something with productsInfo if you want to, like
    # pandas.DataFrame(productsInfo).to_csv('productsInfo.csv') # saves in csv file [import pandas]
    return productsInfo
The current code works perfectly to scrape the information for only one URL, and I would like to be able to scrape from multiple URLs at once (maybe using `for url in urls`). Here is the current code for just one URL below. Any help or direction would be appreciated.
import datetime
from bs4 import BeautifulSoup
import requests
def get_url_data_from_url_request(url):
    """Fetch *url* over HTTP and return the response body as text.

    Parameters:
        url (str): Absolute URL of the page to download.

    Returns:
        str: The decoded response body (``response.text``).
    """
    print(">> get_url_data_from_url_request: " + str(url))
    # Fixed: this string literal was broken across multiple lines in the
    # original (a SyntaxError); implicit concatenation keeps lines short.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.93 Safari/537.36"
        )
    }
    # Context manager guarantees the session (and its connections) are
    # closed even if the request raises, replacing the manual close() calls.
    with requests.Session() as s:
        s.keep_alive = False  # discourage connection reuse between requests
        response = s.get(url, proxies=None, headers=headers)
        print("request.status_code: ", response.status_code)
        return response.text
def main():
    """Entry point: download one product page and print its SKU."""
    print("bdr.sandbox")
    generated_on = datetime.datetime.now()
    print(generated_on)
    source_product_url = ""  # TODO: set the product URL to scrape
    url_data = get_url_data_from_url_request(url=source_product_url)
    soup = BeautifulSoup(url_data, "lxml")
    sku_tag = soup.find('span', {"itemprop": "sku"})
    # Guard: find() returns None when the element is absent; fail with a
    # clear message instead of a bare AttributeError on `.text`.
    if sku_tag is None:
        raise ValueError(f"no <span itemprop='sku'> found at {source_product_url}")
    id_element = sku_tag.text
    print(id_element)


if __name__ == '__main__':
    main()
There are countless ways to do it; this is one example: parametrize `source_product_url` in `main` and call it in a loop. (Add `return id_element` if you want to save the data for future use.)
def get_sku(source_product_url):
    """Download one product page and return its SKU text.

    Parameters:
        source_product_url (str): URL of the product page to scrape.

    Returns:
        str: Text content of the ``<span itemprop="sku">`` element.
    """
    print("bdr.sandbox")
    print(datetime.datetime.now())
    html = get_url_data_from_url_request(url=source_product_url)
    sku = BeautifulSoup(html, "lxml").find('span', {"itemprop": "sku"}).text
    print(sku)
    return sku
You could just do for url in urlsList: get_sku(url)
at this point, but if you want to collect some data [and also make sure that, even if any errors are raised while scraping some of the pages, the program still attempts to scrape the rest], then you could do something like
def try_get_sku(srcProdUrl):
    """Scrape one URL without ever raising.

    Parameters:
        srcProdUrl (str): Product URL to scrape.

    Returns:
        dict: Always contains 'source_product_url'; on success also 'sku',
        on failure 'errorMsg' with the exception type and message.
    """
    try:
        return {'source_product_url': srcProdUrl, 'sku': get_sku(srcProdUrl)}
    except Exception as e:
        # Swallow per-URL failures deliberately so a loop over many URLs
        # keeps going; the error is recorded in the returned dict.
        errorMsg = f'{type(e)} {e}'
        # Fixed: the original ' n ' was a mangled '\n' escape.
        print(f'failed to scrape {srcProdUrl}\n', errorMsg)
        return {'source_product_url': srcProdUrl, 'errorMsg': errorMsg}
(This also makes it easy to use in a list comprehension.)
def main():
    """Scrape every URL in spUrls, collecting one result dict per URL."""
    spUrls = [
        'https://www.6pm.com/p/easy-spirit-epic-gray/product/9450972/color/11',
        ## ADD ALL URLS TO SCRAPE ##
    ]
    # Bug fix: the original called try_get_sku(spUrls), passing the whole
    # list on every iteration instead of the individual url.
    productsInfo = [try_get_sku(url) for url in spUrls]
    # do something with productsInfo if you want to, like
    # pandas.DataFrame(productsInfo).to_csv('productsInfo.csv') # saves in csv file [import pandas]
    return productsInfo