python selenium getting urls from google search results

Question:

I am trying to get the first 10 URLs from Google search results with Selenium. I know there is some attribute other than innerHTML that will give me just the text inside the cite tags.

Here is the code:

#open google
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys

# Chrome options: a visible (non-headless), maximized window with the
# automation-related switches and features turned off.
chrome_options = Options()
chrome_options.headless = False
chrome_options.add_argument("start-maximized")
# options.add_experimental_option("detach", True)
chrome_options.add_argument("--no-sandbox")
# add_experimental_option keeps a single value per option name, so calling it
# twice with "excludeSwitches" would replace the first list with the second.
# Pass both switches in one call so neither is lost.
chrome_options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

# Start Chrome with a driver binary resolved by webdriver_manager, then open Google.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')

# Query to type into the search box (swap in the input() line to ask the user).
# var_inp = input('Write the name to search:')
var_inp = 'python google search'

# Wait until the search box (name="q") is clickable, then type the query
# followed by RETURN to submit it.
search_box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.NAME, "q"))
)
search_box.send_keys(var_inp + Keys.RETURN)

# Wait for the <cite> elements of the results page and collect all of them.
res_lst = []
cite_locator = (By.TAG_NAME, 'cite')
res = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(cite_locator)
)
print(len(res))
for r in res:
    # innerHTML is the raw markup of the element, nested <span> included.
    print(r.get_attribute('innerHTML'))

#take email addresses from company
#send email

The result is below:

https://github.com<span class="dyjrff qzEoUe" role="text"> › opsdisk</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>

I want to get rid of the <span>...</span> part, as I need only the URLs. I could strip it with a regular expression, but I am looking for something like get_attribute('TEXT') that will give the result directly.

Asked By: xlmaster

||

Answers:

This is for this specific case:

def remove_span(string):
    """Return *string* with its first ``<span ...>...</span>`` element removed.

    The text before the opening ``<span`` and the text after the matching
    ``</span>`` are joined together. If the string has no ``<span`` or the
    span is never closed, the string is returned unchanged.
    """
    closing_tag = "</span>"

    start = string.find("<span")
    if start == -1:
        # str.find returns -1 when nothing is found; slicing with it would
        # silently chop characters off the string.
        return string

    # Look for the closing tag only after the opening one, so a stray
    # "</span>" earlier in the string is not mistaken for the end.
    close_at = string.find(closing_tag, start)
    if close_at == -1:
        return string

    end = close_at + len(closing_tag)
    return string[:start] + string[end:]

The function manipulates the string and removes the span from it.

for r in res:
    # Call the function by its defined name, remove_span; with the span
    # stripped this prints e.g. https://github.com
    print(remove_span(r.get_attribute('innerHTML')))
Answered By: Juan Melnechuk

The best way to get the value of the node is to use the JavaScript executor and read the text of the node's first child.

# Start Chrome with a driver binary resolved by webdriver_manager, then open Google.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
driver.get('https://www.google.com/')

# Query to type into the search box (swap in the input() line to ask the user).
# var_inp = input('Write the name to search:')
var_inp = 'python google search'

# Wait until the search box (name="q") is clickable, then type the query
# followed by RETURN to submit it.
search_box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.NAME, "q"))
)
search_box.send_keys(var_inp + Keys.RETURN)

# Wait for the <cite> elements of the results page and collect all of them.
res_lst = []
cite_locator = (By.TAG_NAME, 'cite')
res = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(cite_locator)
)
print(len(res))

# Read the text of only the first child node of each <cite>, so the markup
# of any later children (such as the nested <span>) is left out.
first_child_text_js = "return arguments[0].firstChild.textContent;"
for cite in res:
    print(driver.execute_script(first_child_text_js, cite))

Output:

27
https://pypi.org
https://pypi.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://stackoverflow.com
https://stackoverflow.com
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.jcchouinard.com
https://www.jcchouinard.com
https://www.educative.io
https://www.educative.io
https://python-googlesearch.readthedocs.io
https://python-googlesearch.readthedocs.io
https://medium.com
https://medium.com
https://medium.com
https://medium.com
https://github.com
https://github.com
https://github.com
https://github.com
Answered By: KunduK