Unable to scrape google images selenium
Question:
I have the following script, which I want to use to scrape Google Images. It clicks on the first image and then clicks the next (>) button to switch to the next image.
It downloads the first image, but when it is the second image's turn, it throws an error.
Traceback (most recent call last):
File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>
attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")
File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
My code :
import requests
import shutil
import time
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Desktop Chrome user agent. The parentheses join the two literals into ONE
# string; without them the second literal is a separate no-op statement and
# the user agent is silently truncated.
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/80.0.3987.132 Safari/537.36')
options = Options()
#options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")
# Click the first thumbnail to open the preview panel.
driver.find_element_by_class_name("rg_i").click()
i = 0
while i < 10:
    i += 1
    time.sleep(5)
    # This explicit wait is the call the traceback points at (TimeoutException).
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))).get_attribute("src")
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    # NOTE: local_file is never closed here; the refactored answer ("issue 2")
    # replaces this with a "with" block.
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    # Click the "next" (>) arrow of the preview panel.
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
Answers:
Disclaimer: I doubt that Google allows scraping on Search. You should check out https://www.google.com/robots.txt to find out.
That being said, I think there is a problem in your WebDriverWait call, though I am not sure what exactly it is. Since you already make your driver wait before that with time.sleep, I just tried to find the element directly, and it worked:
# Same download loop as in the question; only the line marked "NEW LINE"
# differs: the element is looked up directly instead of via WebDriverWait.
i = 0
while i < 10:
    i += 1
    # Fixed pause that gives the preview panel time to load the image.
    time.sleep(5)
    attribute_value = driver.find_element_by_css_selector("img.n3VNCb").get_attribute("src") # NEW LINE
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    # Click the "next" (>) arrow of the preview panel.
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
I’ve tidied up and refactored your code a bit. The final result is capable of grabbing n images for keywords of your choice (see SEARCH_TERMS):
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Path of the chromedriver executable handed to webdriver.Chrome.
CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
# Keywords to search for; also used to name the output folder.
SEARCH_TERMS = ['very', 'hot', 'chicks']
# File-name template: the two "{}" placeholders are later filled with the
# image index and the image format via str.format.
TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
# Create the output folder up front; exist_ok avoids a check-then-create race.
os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION), exist_ok=True)
def check_if_result_b64(source):
    """Tell whether an image ``src`` value is an inline base64 data URI.

    Args:
        source: The ``src`` attribute value; may be ``None`` or empty when the
            element has no ``src``.

    Returns:
        The text left of the header after removing ``data:image/`` and
        ``;base64`` (e.g. ``"jpeg"``) when ``source`` is a base64 data URI,
        otherwise ``False``.
    """
    # A missing attribute comes back as None; treat it as "not base64"
    # instead of failing on None.split.
    if not source:
        return False
    # Everything before the first comma is the data-URI header.
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False
def get_driver():
    """Start Chrome and open the Google Images results page for SEARCH_TERMS.

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to the results
        page.
    """
    from urllib.parse import quote_plus

    # The parentheses join the two literals into ONE string; without them the
    # second literal is a separate no-op statement and the UA is truncated.
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36')
    options = Options()
    #options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    # URL-encode each term so keywords containing "&", "#", spaces, etc. do
    # not corrupt the query string; plain words are left unchanged.
    query = '+'.join(quote_plus(term) for term in SEARCH_TERMS)
    new_driver.get(f"https://www.google.com/search?q={query}&source=lnms&tbm=isch&sa=X")
    return new_driver
driver = get_driver()

# Click the first thumbnail to open the preview panel.
first_search_result = driver.find_elements(By.XPATH, '//a/div/img')[0]
first_search_result.click()

right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements(By.XPATH, '//*[@data-noaft="1"]')[0]
# The class attribute of the preview image is read at runtime rather than
# hard-coded, then reused to locate the preview images on every iteration.
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for "to_be_loaded image"
thumbnail_src = driver.find_elements(By.XPATH, image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # The last element is the base64 encoded thumbnail version of the "next image"
    # [-2] element is the element currently displayed
    target = driver.find_elements(By.XPATH, image_finder_xp)[-2]

    # you need to wait until image is completely loaded:
    # first the base64 encoded thumbnail will be displayed
    # so we check if the displayed element src match the cached thumbnail src.
    # However sometimes the final result is the base64 content, so wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    # Cache the thumbnail of the upcoming image for the next iteration's check.
    thumbnail_src = driver.find_elements(By.XPATH, image_finder_xp)[-1].get_attribute("src")
    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests get won't work because the src is not an url
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        # Let PIL read the downloaded bytes to get the image format, which is
        # used as the file extension.
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: this Xpath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" if page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    next_arrow = driver.find_elements(By.XPATH, svg_arrows_xpath)[-3]
    next_arrow.click()
Selenium can take a lot of time and resources (CPU, RAM), so you can instead scrape Google Images data from the inline JSON using the BeautifulSoup web scraping library and regular expressions.
The data you need is located in the page source, in the inline JSON.
To locate and extract data from the inline JSON you need to:
- open the page source (CTRL + U);
- find the data you need with the CTRL + F search option;
- use regular expressions to extract parts of the inline JSON:
# Pull the argument of every AF_initDataCallback(...) call out of the page's
# <script> tags and concatenate them into one string.
# https://regex101.com/r/4OVJHX/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# dumps followed by loads on a str yields the same str back.
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel.
# https://regex101.com/r/WELgPS/1
matched_google_image_data = re.findall(r'"b-GRID_STATE0"(.*)sideChannel:\s?{}}', matched_images_data_json)

# Thumbnail entries look like ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>]
# https://regex101.com/r/gfpJGH/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

# The URLs are escaped twice in the page source, hence the double decode.
thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)
Check full code in the online IDE.
import requests, re, json, lxml
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}

google_images = []

params = {
    "q": "mac beautiful ui",  # search query
    "tbm": "isch",            # image results
    "hl": "en",               # language of the search
    "gl": "us"                # country where search comes from
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

all_script_tags = soup.select("script")

# Pull the argument of every AF_initDataCallback(...) call out of the page's
# <script> tags and concatenate them into one string.
# https://regex101.com/r/4OVJHX/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# dumps followed by loads on a str yields the same str back.
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel.
# https://regex101.com/r/WELgPS/1
matched_google_image_data = re.findall(r'"b-GRID_STATE0"(.*)sideChannel:\s?{}}', matched_images_data_json)

# Thumbnail entries look like ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>]
# https://regex101.com/r/gfpJGH/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

# The URLs are escaped twice in the page source, hence the double decode.
thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
]

# Pair each result card from the HTML with its thumbnail and full-size URL;
# zip stops at the shortest of the three sequences.
for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
    google_images.append({
        "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
        "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
        "source": metadata.select_one(".fxgdke").text,
        "thumbnail": thumbnail,
        "original": original
    })

print(json.dumps(google_images, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "macOS UI Design – The Best Free Resources & Inspiration | UXMISFIT.COM",
"link": "https://uxmisfit.com/2017/01/16/macos-ui-design-the-best-free-resources-inspiration/",
"source": "UXMISFIT.COM",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR8H9dXnxZVu_g0GVPY9np8a3ZkwsIFOBrO5No9EZic-X1ykSvWVVWrsGlXxL4sOr5j9lo&usqp=CAU",
"original": "https://uxmisfit.com/wp-content/uploads/2016/12/alexander_2uikit-1024x576.png"
},
{
"title": "Design a beautiful \"dark-mode\" ui for simple mac app | App design contest | 99designs",
"link": "https://99designs.com/mobile-app-design/contests/design-beautiful-dark-mode-ui-simple-mac-app-847502",
"source": "99Designs",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTfOgfPHVtNLSGIUrfE7p0sNCVvFV-CZhxxfZ6wAIU2C037IjkZkInnB0nlKvCxwjOqGNc&usqp=CAU",
"original": "https://images-platform.99static.com/mvyP91zYUsrpXUvhqBjNzDy4DqY=/3x0:1618x1615/500x500/top/smart/99designs-contests-attachments/99/99210/attachment_99210540"
},
other results ...
]
You can also use the Google Images API from SerpApi. It’s a paid API with a free plan.
The difference is that it bypasses blocks (including CAPTCHA) from Google, so there is no need to create and maintain a parser.
Simple code example:
from serpapi import GoogleSearch
import json
image_results = []

# search query parameters
params = {
    "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
    "q": "mac beautiful ui",          # search query
    "tbm": "isch",                    # image results
    "num": "100",                     # number of images per page
    "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
    "api_key": "..."                  # serpapi key, https://serpapi.com/manage-api-key
    # other query parameters: hl (lang), gl (country), etc
}

search = GoogleSearch(params)         # where data extraction happens

images_is_present = True
while images_is_present:
    results = search.get_dict()       # JSON -> Python dictionary

    # checks for "Google hasn't returned any results for this query."
    if "error" not in results:
        for image in results["images_results"]:
            # Skip URLs already collected from an earlier page.
            if image["original"] not in image_results:
                image_results.append(image["original"])

        # update to the next page
        # NOTE(review): relies on GoogleSearch reading this same dict on the
        # next get_dict() call; confirm against the serpapi client.
        params["ijn"] += 1
    else:
        # An "error" key ends pagination; report it and stop.
        images_is_present = False
        print(results["error"])

print(json.dumps(image_results, indent=2))
Output:
[
"https://cdn.dribbble.com/users/422170/screenshots/11461839/powered-kiss-principle_def.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/users/226242/screenshots/18225016/media/3ab1bf230107da033342633cf7b09733.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/userupload/4045436/file/original-43340f87ef951b4da0343d93dba54373.png?resize=400x0",
"https://cdn.dribbble.com/users/2404/screenshots/15447041/media/d7861d9863e2305a0ad8adb415a662dd.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/userupload/4089487/file/original-6182d105d299d35d88d92043bf75c100.png?resize=400x0",
"https://cdn.dribbble.com/users/606683/screenshots/17054778/media/3d93194d1946de60585cefa299d9acd7.png?compress=1&resize=400x300",
other results ...
]
There’s a Scrape and download Google Images with Python blog post if you need a little bit more code explanation.
I have the following script, which I want to use to scrape Google Images. It clicks on the first image and then clicks the next (>) button to switch to the next image.
It downloads the first image, but when it is the second image's turn, it throws an error.
Traceback (most recent call last):
File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>
attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")
File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
My code :
import requests
import shutil
import time
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Desktop Chrome user agent. The parentheses join the two literals into ONE
# string; without them the second literal is a separate no-op statement and
# the user agent is silently truncated.
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/80.0.3987.132 Safari/537.36')
options = Options()
#options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")
# Click the first thumbnail to open the preview panel.
driver.find_element_by_class_name("rg_i").click()
i = 0
while i < 10:
    i += 1
    time.sleep(5)
    # This explicit wait is the call the traceback points at (TimeoutException).
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))).get_attribute("src")
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    # NOTE: local_file is never closed here; the refactored answer ("issue 2")
    # replaces this with a "with" block.
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    # Click the "next" (>) arrow of the preview panel.
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
Disclaimer: I doubt that Google allows scraping on Search. You should check out https://www.google.com/robots.txt to find out.
That being said, I think there is a problem in your WebDriverWait call, though I am not sure what exactly it is. Since you already make your driver wait before that with time.sleep, I just tried to find the element directly, and it worked:
# Same download loop as in the question; only the line marked "NEW LINE"
# differs: the element is looked up directly instead of via WebDriverWait.
i = 0
while i < 10:
    i += 1
    # Fixed pause that gives the preview panel time to load the image.
    time.sleep(5)
    attribute_value = driver.find_element_by_css_selector("img.n3VNCb").get_attribute("src") # NEW LINE
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    # Click the "next" (>) arrow of the preview panel.
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
I’ve tidied up and refactored your code a bit. The final result is capable of grabbing n images for keywords of your choice (see SEARCH_TERMS):
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Path of the chromedriver executable handed to webdriver.Chrome.
CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
# Keywords to search for; also used to name the output folder.
SEARCH_TERMS = ['very', 'hot', 'chicks']
# File-name template: the two "{}" placeholders are later filled with the
# image index and the image format via str.format.
TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
# Create the output folder up front; exist_ok avoids a check-then-create race.
os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION), exist_ok=True)
def check_if_result_b64(source):
    """Tell whether an image ``src`` value is an inline base64 data URI.

    Args:
        source: The ``src`` attribute value; may be ``None`` or empty when the
            element has no ``src``.

    Returns:
        The text left of the header after removing ``data:image/`` and
        ``;base64`` (e.g. ``"jpeg"``) when ``source`` is a base64 data URI,
        otherwise ``False``.
    """
    # A missing attribute comes back as None; treat it as "not base64"
    # instead of failing on None.split.
    if not source:
        return False
    # Everything before the first comma is the data-URI header.
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False
def get_driver():
    """Start Chrome and open the Google Images results page for SEARCH_TERMS.

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to the results
        page.
    """
    from urllib.parse import quote_plus

    # The parentheses join the two literals into ONE string; without them the
    # second literal is a separate no-op statement and the UA is truncated.
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36')
    options = Options()
    #options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    # URL-encode each term so keywords containing "&", "#", spaces, etc. do
    # not corrupt the query string; plain words are left unchanged.
    query = '+'.join(quote_plus(term) for term in SEARCH_TERMS)
    new_driver.get(f"https://www.google.com/search?q={query}&source=lnms&tbm=isch&sa=X")
    return new_driver
driver = get_driver()

# Click the first thumbnail to open the preview panel.
first_search_result = driver.find_elements(By.XPATH, '//a/div/img')[0]
first_search_result.click()

right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements(By.XPATH, '//*[@data-noaft="1"]')[0]
# The class attribute of the preview image is read at runtime rather than
# hard-coded, then reused to locate the preview images on every iteration.
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for "to_be_loaded image"
thumbnail_src = driver.find_elements(By.XPATH, image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # The last element is the base64 encoded thumbnail version of the "next image"
    # [-2] element is the element currently displayed
    target = driver.find_elements(By.XPATH, image_finder_xp)[-2]

    # you need to wait until image is completely loaded:
    # first the base64 encoded thumbnail will be displayed
    # so we check if the displayed element src match the cached thumbnail src.
    # However sometimes the final result is the base64 content, so wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    # Cache the thumbnail of the upcoming image for the next iteration's check.
    thumbnail_src = driver.find_elements(By.XPATH, image_finder_xp)[-1].get_attribute("src")
    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests get won't work because the src is not an url
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        # Let PIL read the downloaded bytes to get the image format, which is
        # used as the file extension.
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: this Xpath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" if page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    next_arrow = driver.find_elements(By.XPATH, svg_arrows_xpath)[-3]
    next_arrow.click()
Selenium can take a lot of time and resources (CPU, RAM), so you can instead scrape Google Images data from the inline JSON using the BeautifulSoup web scraping library and regular expressions.
The data you need is located in the page source, in the inline JSON.
To locate and extract data from the inline JSON you need to:
- open the page source (CTRL + U);
- find the data you need with the CTRL + F search option;
- use regular expressions to extract parts of the inline JSON:
# Pull the argument of every AF_initDataCallback(...) call out of the page's
# <script> tags and concatenate them into one string.
# https://regex101.com/r/4OVJHX/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# dumps followed by loads on a str yields the same str back.
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel.
# https://regex101.com/r/WELgPS/1
matched_google_image_data = re.findall(r'"b-GRID_STATE0"(.*)sideChannel:\s?{}}', matched_images_data_json)

# Thumbnail entries look like ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>]
# https://regex101.com/r/gfpJGH/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

# The URLs are escaped twice in the page source, hence the double decode.
thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)
Check full code in the online IDE.
import requests, re, json, lxml
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}

google_images = []

params = {
    "q": "mac beautiful ui",  # search query
    "tbm": "isch",            # image results
    "hl": "en",               # language of the search
    "gl": "us"                # country where search comes from
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

all_script_tags = soup.select("script")

# Pull the argument of every AF_initDataCallback(...) call out of the page's
# <script> tags and concatenate them into one string.
# https://regex101.com/r/4OVJHX/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# dumps followed by loads on a str yields the same str back.
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel.
# https://regex101.com/r/WELgPS/1
matched_google_image_data = re.findall(r'"b-GRID_STATE0"(.*)sideChannel:\s?{}}', matched_images_data_json)

# Thumbnail entries look like ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>]
# https://regex101.com/r/gfpJGH/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

# The URLs are escaped twice in the page source, hence the double decode.
thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
]

# Pair each result card from the HTML with its thumbnail and full-size URL;
# zip stops at the shortest of the three sequences.
for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
    google_images.append({
        "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
        "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
        "source": metadata.select_one(".fxgdke").text,
        "thumbnail": thumbnail,
        "original": original
    })

print(json.dumps(google_images, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "macOS UI Design – The Best Free Resources & Inspiration | UXMISFIT.COM",
"link": "https://uxmisfit.com/2017/01/16/macos-ui-design-the-best-free-resources-inspiration/",
"source": "UXMISFIT.COM",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR8H9dXnxZVu_g0GVPY9np8a3ZkwsIFOBrO5No9EZic-X1ykSvWVVWrsGlXxL4sOr5j9lo&usqp=CAU",
"original": "https://uxmisfit.com/wp-content/uploads/2016/12/alexander_2uikit-1024x576.png"
},
{
"title": "Design a beautiful \"dark-mode\" ui for simple mac app | App design contest | 99designs",
"link": "https://99designs.com/mobile-app-design/contests/design-beautiful-dark-mode-ui-simple-mac-app-847502",
"source": "99Designs",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTfOgfPHVtNLSGIUrfE7p0sNCVvFV-CZhxxfZ6wAIU2C037IjkZkInnB0nlKvCxwjOqGNc&usqp=CAU",
"original": "https://images-platform.99static.com/mvyP91zYUsrpXUvhqBjNzDy4DqY=/3x0:1618x1615/500x500/top/smart/99designs-contests-attachments/99/99210/attachment_99210540"
},
other results ...
]
You can also use the Google Images API from SerpApi. It’s a paid API with a free plan.
The difference is that it bypasses blocks (including CAPTCHA) from Google, so there is no need to create and maintain a parser.
Simple code example:
from serpapi import GoogleSearch
import json
image_results = []

# search query parameters
params = {
    "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
    "q": "mac beautiful ui",          # search query
    "tbm": "isch",                    # image results
    "num": "100",                     # number of images per page
    "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
    "api_key": "..."                  # serpapi key, https://serpapi.com/manage-api-key
    # other query parameters: hl (lang), gl (country), etc
}

search = GoogleSearch(params)         # where data extraction happens

images_is_present = True
while images_is_present:
    results = search.get_dict()       # JSON -> Python dictionary

    # checks for "Google hasn't returned any results for this query."
    if "error" not in results:
        for image in results["images_results"]:
            # Skip URLs already collected from an earlier page.
            if image["original"] not in image_results:
                image_results.append(image["original"])

        # update to the next page
        # NOTE(review): relies on GoogleSearch reading this same dict on the
        # next get_dict() call; confirm against the serpapi client.
        params["ijn"] += 1
    else:
        # An "error" key ends pagination; report it and stop.
        images_is_present = False
        print(results["error"])

print(json.dumps(image_results, indent=2))
Output:
[
"https://cdn.dribbble.com/users/422170/screenshots/11461839/powered-kiss-principle_def.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/users/226242/screenshots/18225016/media/3ab1bf230107da033342633cf7b09733.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/userupload/4045436/file/original-43340f87ef951b4da0343d93dba54373.png?resize=400x0",
"https://cdn.dribbble.com/users/2404/screenshots/15447041/media/d7861d9863e2305a0ad8adb415a662dd.png?compress=1&resize=400x300",
"https://cdn.dribbble.com/userupload/4089487/file/original-6182d105d299d35d88d92043bf75c100.png?resize=400x0",
"https://cdn.dribbble.com/users/606683/screenshots/17054778/media/3d93194d1946de60585cefa299d9acd7.png?compress=1&resize=400x300",
other results ...
]
There’s a Scrape and download Google Images with Python blog post if you need a little bit more code explanation.