python web scraping for emails
Question:
I wrote this code to scrape email addresses from Google search results or websites, depending on the URL given. However, the output is always blank.
The only thing in the excel sheet is the column name. I’m still new to python so not sure why that’s happening.
What am I missing here?
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Google search-results URL to scan for mailto: links.
url = "https://www.google.com/search?q=solicitor+bereavement+wales+%27email%27&rlz=1C1CHBD_en-GBIT1013IT1013&sxsrf=AJOqlzWelf5qGpc4uqy_C2cd583OKlSEcQ%3A1675616694195&ei=tuHfY83MC-aIrwSQ3qxY&ved=0ahUKEwjN_9jO7v78AhVmxIsKHRAvCwsQ4dUDCBA&uact=5&oq=solicitor+bereavement+wales+%27email%27&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQogQyBwgAEB4QogQyBwgAEB4QogQyBwgAEB4QogQyBwgAEB4QogQ6CggAEEcQ1gQQsANKBAhBGABKBAhGGABQrAxY7xRg1xZoAXABeACAAdIBiAGmBpIBBTEuNC4xmAEAoAEByAEIwAEB&sclient=gws-wiz-serp"

response = requests.get(url)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

email_addresses = []
# FIX: restrict to anchors that actually carry an href attribute.
# link.get('href') returns None for <a> tags without one, and
# `'mailto:' in None` raises TypeError, aborting the scrape.
for link in soup.find_all('a', href=True):
    href = link['href']
    if 'mailto:' in href:
        email_addresses.append(href.replace('mailto:', ''))

# NOTE(review): Google's result page is rendered/blocked for plain requests
# without a browser User-Agent, so this list may legitimately be empty even
# with the crash fixed -- see the answers below.
df = pd.DataFrame(email_addresses, columns=['Email Addresses'])
df.to_excel('email_addresses_.xlsx', index=False)
Answers:
It’s not finding the html you want because the html is loaded dynamically with javascript. Thus you need to execute the javascript to get all the html.
The selenium module can be used to do this, but it requires a driver to interface with a given browser. So you’ll need to install a browser driver in order to use the selenium module. The selenium documentation goes over the installation
Once you have selenium setup, you can use this function to get all the html from the website. Pass its return value into the BeautifulSoup object.
from selenium import webdriver
from time import sleep

def get_page_source(url):
    """Return the fully rendered HTML of *url* via a real Chrome browser.

    Selenium executes the page's javascript before the source is read,
    so dynamically injected content is included in the result.

    :param url: address to load in the browser
    :return: the rendered page source as a string
    """
    # FIX: create the driver *before* the try block.  In the original,
    # if webdriver.Chrome() itself raised, `driver` was unbound and the
    # finally clause failed with NameError, masking the real error.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        sleep(3)  # crude wait for javascript to finish rendering
        return driver.page_source
    finally:
        driver.quit()  # always release the browser process
First you need to extract all the snippets on the page:
# Each organic result on a Google SERP is wrapped in a .tF2Cxc container;
# .lEBKkf holds the visible text snippet.  (Fragment: `soup` is built
# earlier; indentation of the loop body was lost in formatting.)
for result in soup.select('.tF2Cxc'):
snippet = result.select_one('.lEBKkf').text
A regular expression is then used to extract the email address from each snippet (if one is present in the snippet):
# FIX: the backslashes were stripped by formatting -- '[w.-]' matches only
# the literal characters "w", "." and "-", so no real address ever matched.
# The intended class is \w (word chars) plus dot/dash, with the dot before
# the TLD escaped.
match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
email = ''.join(match_email)
Also, instead of a request for a full URL, you can make a request for certain parameters (it’s convenient if you need to change query or other parameters):
# Query parameters for requests.get(); requests URL-encodes them, so the
# query can be changed without hand-building a full search URL.
params = {
'q': 'intext:"gmail.com" solicitor bereavement wale', # your query
'hl': 'en', # language
'gl': 'us' # country of the search, US -> USA
# other parameters
}
Check full code in the online IDE.
import requests, re, json, lxml
from bs4 import BeautifulSoup

# Send a desktop-browser User-Agent; otherwise Google serves a stripped-down
# (or block) page whose markup the CSS selectors below do not match.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}

params = {
    'q': 'intext:"gmail.com" solicitor bereavement wale',  # your query
    'hl': 'en',  # language
    'gl': 'us'   # country of the search, US -> USA
}

html = requests.get("https://www.google.com/search",
                    headers=headers,
                    params=params).text
soup = BeautifulSoup(html, 'lxml')

data = []
# .tF2Cxc wraps one organic result; .DKV0Md is its title, .lEBKkf its snippet.
for result in soup.select('.tF2Cxc'):
    title = result.select_one('.DKV0Md').text
    link = result.find('a')['href']
    snippet = result.select_one('.lEBKkf').text

    # FIX: restore the backslashes lost in formatting -- '[w.-]' matched
    # only the literal characters w, . and -, so emails never matched.
    match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
    email = ''.join(match_email)

    data.append({
        'Title': title,
        'Link': link,
        'Email': email if email else None
    })

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"Title": "Revealed: Billboard's 2022 Top Music Lawyers",
"Link": "https://www.billboard.com/wp-content/uploads/2022/03/march-28-2022-billboard-bulletin.pdf",
"Email": "[email protected]"
},
{
"Title": "Folakemi Jegede, LL.B, BL, LLM, ACIS.'s Post - LinkedIn",
"Link": "https://www.linkedin.com/posts/folakemi-jegede-ll-b-bl-llm-acis-855a8a2a_lawyers-law-advocate-activity-6934498515867815936-9R6G?trk=posts_directory",
"Email": "[email protected]"
},
other results ...
]
Also you can use Google Search Engine Results API from SerpApi. It’s a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
import os, json, re

params = {
    "engine": "google",  # search engine
    "q": 'intext:"gmail.com" solicitor bereavement wale',  # search query
    "api_key": "..."  # serpapi key from https://serpapi.com/manage-api-key
}

search = GoogleSearch(params)  # where data extraction happens
results = search.get_dict()    # JSON -> Python dictionary

data = []
for result in results['organic_results']:
    title = result['title']
    link = result['link']
    snippet = result['snippet']

    # FIX: formatting stripped the backslashes -- restore \w in the email
    # pattern and use '\n' (newline), not the letter 'n', as the joiner
    # when a snippet contains several addresses.
    match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
    email = '\n'.join(match_email)

    data.append({
        'title': title,
        'link': link,
        'email': email if email else None,
    })

print(json.dumps(data, indent=2, ensure_ascii=False))
Output: exactly the same as in the previous solution.
I wrote this code to scrape email addresses from Google search results or websites, depending on the URL given. However, the output is always blank.
The only thing in the excel sheet is the column name. I’m still new to python so not sure why that’s happening.
What am I missing here?
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Google search-results URL to scan for mailto: links.
url = "https://www.google.com/search?q=solicitor+bereavement+wales+%27email%27&rlz=1C1CHBD_en-GBIT1013IT1013&sxsrf=AJOqlzWelf5qGpc4uqy_C2cd583OKlSEcQ%3A1675616694195&ei=tuHfY83MC-aIrwSQ3qxY&ved=0ahUKEwjN_9jO7v78AhVmxIsKHRAvCwsQ4dUDCBA&uact=5&oq=solicitor+bereavement+wales+%27email%27&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQogQyBwgAEB4QogQyBwgAEB4QogQyBwgAEB4QogQyBwgAEB4QogQ6CggAEEcQ1gQQsANKBAhBGABKBAhGGABQrAxY7xRg1xZoAXABeACAAdIBiAGmBpIBBTEuNC4xmAEAoAEByAEIwAEB&sclient=gws-wiz-serp"

response = requests.get(url)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

email_addresses = []
# FIX: restrict to anchors that actually carry an href attribute.
# link.get('href') returns None for <a> tags without one, and
# `'mailto:' in None` raises TypeError, aborting the scrape.
for link in soup.find_all('a', href=True):
    href = link['href']
    if 'mailto:' in href:
        email_addresses.append(href.replace('mailto:', ''))

# NOTE(review): Google's result page is rendered/blocked for plain requests
# without a browser User-Agent, so this list may legitimately be empty even
# with the crash fixed -- see the answers below.
df = pd.DataFrame(email_addresses, columns=['Email Addresses'])
df.to_excel('email_addresses_.xlsx', index=False)
It’s not finding the html you want because the html is loaded dynamically with javascript. Thus you need to execute the javascript to get all the html.
The selenium module can be used to do this, but it requires a driver to interface with a given browser. So you’ll need to install a browser driver in order to use the selenium module. The selenium documentation goes over the installation
Once you have selenium setup, you can use this function to get all the html from the website. Pass its return value into the BeautifulSoup object.
from selenium import webdriver
from time import sleep

def get_page_source(url):
    """Return the fully rendered HTML of *url* via a real Chrome browser.

    Selenium executes the page's javascript before the source is read,
    so dynamically injected content is included in the result.

    :param url: address to load in the browser
    :return: the rendered page source as a string
    """
    # FIX: create the driver *before* the try block.  In the original,
    # if webdriver.Chrome() itself raised, `driver` was unbound and the
    # finally clause failed with NameError, masking the real error.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        sleep(3)  # crude wait for javascript to finish rendering
        return driver.page_source
    finally:
        driver.quit()  # always release the browser process
First you need to extract all the snippets on the page:
# Each organic result on a Google SERP is wrapped in a .tF2Cxc container;
# .lEBKkf holds the visible text snippet.  (Fragment: `soup` is built
# earlier; indentation of the loop body was lost in formatting.)
for result in soup.select('.tF2Cxc'):
snippet = result.select_one('.lEBKkf').text
A regular expression is then used to extract the email address from each snippet (if one is present in the snippet):
# FIX: the backslashes were stripped by formatting -- '[w.-]' matches only
# the literal characters "w", "." and "-", so no real address ever matched.
# The intended class is \w (word chars) plus dot/dash, with the dot before
# the TLD escaped.
match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
email = ''.join(match_email)
Also, instead of a request for a full URL, you can make a request for certain parameters (it’s convenient if you need to change query or other parameters):
# Query parameters for requests.get(); requests URL-encodes them, so the
# query can be changed without hand-building a full search URL.
params = {
'q': 'intext:"gmail.com" solicitor bereavement wale', # your query
'hl': 'en', # language
'gl': 'us' # country of the search, US -> USA
# other parameters
}
Check full code in the online IDE.
import requests, re, json, lxml
from bs4 import BeautifulSoup

# Send a desktop-browser User-Agent; otherwise Google serves a stripped-down
# (or block) page whose markup the CSS selectors below do not match.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}

params = {
    'q': 'intext:"gmail.com" solicitor bereavement wale',  # your query
    'hl': 'en',  # language
    'gl': 'us'   # country of the search, US -> USA
}

html = requests.get("https://www.google.com/search",
                    headers=headers,
                    params=params).text
soup = BeautifulSoup(html, 'lxml')

data = []
# .tF2Cxc wraps one organic result; .DKV0Md is its title, .lEBKkf its snippet.
for result in soup.select('.tF2Cxc'):
    title = result.select_one('.DKV0Md').text
    link = result.find('a')['href']
    snippet = result.select_one('.lEBKkf').text

    # FIX: restore the backslashes lost in formatting -- '[w.-]' matched
    # only the literal characters w, . and -, so emails never matched.
    match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
    email = ''.join(match_email)

    data.append({
        'Title': title,
        'Link': link,
        'Email': email if email else None
    })

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"Title": "Revealed: Billboard's 2022 Top Music Lawyers",
"Link": "https://www.billboard.com/wp-content/uploads/2022/03/march-28-2022-billboard-bulletin.pdf",
"Email": "[email protected]"
},
{
"Title": "Folakemi Jegede, LL.B, BL, LLM, ACIS.'s Post - LinkedIn",
"Link": "https://www.linkedin.com/posts/folakemi-jegede-ll-b-bl-llm-acis-855a8a2a_lawyers-law-advocate-activity-6934498515867815936-9R6G?trk=posts_directory",
"Email": "[email protected]"
},
other results ...
]
Also you can use Google Search Engine Results API from SerpApi. It’s a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
import os, json, re

params = {
    "engine": "google",  # search engine
    "q": 'intext:"gmail.com" solicitor bereavement wale',  # search query
    "api_key": "..."  # serpapi key from https://serpapi.com/manage-api-key
}

search = GoogleSearch(params)  # where data extraction happens
results = search.get_dict()    # JSON -> Python dictionary

data = []
for result in results['organic_results']:
    title = result['title']
    link = result['link']
    snippet = result['snippet']

    # FIX: formatting stripped the backslashes -- restore \w in the email
    # pattern and use '\n' (newline), not the letter 'n', as the joiner
    # when a snippet contains several addresses.
    match_email = re.findall(r'[\w.-]+@[\w.-]+\.\w+', snippet)
    email = '\n'.join(match_email)

    data.append({
        'title': title,
        'link': link,
        'email': email if email else None,
    })

print(json.dumps(data, indent=2, ensure_ascii=False))
Output: exactly the same as in the previous solution.