BeautifulSoup get 'f slp' items from google
Question:
Hi Guys I’m trying to get the citations from a number of papers out of google. This is my code
import urllib
import mechanize
from bs4 import BeautifulSoup
import csv
import os #change directory
import re #for regular expressions
# Browser setup: ignore robots.txt and present a Firefox user agent so
# Google returns a normal HTML results page instead of blocking the bot.
br = mechanize.Browser()
br.set_handle_equiv(False)
br.set_handle_robots(False) # ignore robots
br.addheaders = [('User-agent', 'Firefox')] # [()]
br.open('http://google.com/')
br.select_form(name='f') # Note: select the form named 'f' here
# Paper title used as the search query, spaces replaced with '+'.
term = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease".replace(" ","+")
br.form['q'] = term # query
data = br.submit()
# NOTE(review): no parser argument is passed to BeautifulSoup, so bs4 will
# warn and pick whichever parser is installed — results can vary by machine.
soup = BeautifulSoup(data)
# 'f slp' is the CSS class that carries the "Cited by N" line on some Google
# result pages; the page returned by this form submit does not contain it,
# so cite ends up empty.
cite= soup.findAll('div',{'class': 'f slp'})
# NOTE(review): cite[1] raises IndexError whenever fewer than two matching
# divs are found — this is the error the question is about.
ref = str(cite[1])
print ref
However I keep getting errors. I want the number of citations this paper has.
Answers:
The problem is that there is no citation info on the page you are getting after the form submit — in other words, there are no `div` elements with the `f slp` class.
You have several options to solve it:
- instead of mechanize, automate it in a real browser with selenium
- use google search api
See also:
- Google Search from a Python App
- GoogleScraper.py – A simple python module to parse google search results.
Hope that helps.
To get the citations from a number of papers out of Google, you can use a regular expression that extracts the "Cited by" text from the snippet:
snippet = result.select_one(".lEBKkf").text
cited_by = re.search(r'Cited by (\d+)', snippet).group()
In order to collect information from all pages, you need to use pagination with a while loop.
Pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector ".d6cvqb a[id=pnnext]", you need to increase the value of ["start"] by 10 to access the next page, if present, otherwise, we need to exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, re, lxml

# Paper title used as the Google search query.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,   # query
    "hl": "en",   # language
    "gl": "uk",   # country of the search, UK -> United Kingdom
    "start": 0,   # result offset; incremented by 10 per page for pagination
    # "num": 100  # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Compiled once, outside the loop. BUG FIX: the original pattern was
# r'Cited by (d+)' — a literal "d", which never matches a digit. The
# backslash in \d+ had been lost, so cited_by was always None.
CITED_BY_RE = re.compile(r"Cited by (\d+)")

page_num = 0
citations = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search",
                        params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text

        # The snippet element may be absent; select_one then returns None
        # and .text raises AttributeError. Catch only that — a bare except
        # would also hide real bugs (KeyboardInterrupt, typos, ...).
        try:
            snippet = result.select_one(".lEBKkf").text
        except AttributeError:
            snippet = None

        # Not every snippet carries a citation count; keep None when missing.
        match = CITED_BY_RE.search(snippet) if snippet else None
        cited_by = match.group() if match else None

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

    # Paginate while the "next" button exists; otherwise stop.
    if soup.select_one(".d6cvqb a[id=pnnext]"):
        params["start"] += 10
    else:
        break

print(json.dumps(citations, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Targeted therapeutic options and future perspectives for ...",
"snippet": "by J Wang · 2019 · Cited by 238 — Since its launch in 1998, trastuzumab became a therapeutic for breast cancer patients with HER2 overexpression and is widely administrated as ...",
"cited_by": "Cited by 238"
},
{
"title": "Trastuzumab Regimens for HER2-Overexpressing Metastatic ...",
"snippet": "by DR Spigel · 2003 · Cited by 30 — Multinational study of the efficacy and safety of humanized anti-HER2 ... breast cancer that has progressed after chemotherapy for metastatic disease.",
"cited_by": "Cited by 30"
},
other results...
]
Also one of the solutions is to use Google Search Engine Results API from SerpApi. It’s a paid API with the free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
import os, json

# Paper title used as the search query.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

params = {
    "api_key": "...",     # https://serpapi.com/manage-api-key
    "device": "desktop",  # device
    "engine": "google",   # serpapi parser engine
    "q": query,           # query
    "gl": "uk",           # country of the search, UK -> United Kingdom
    "hl": "en"            # language
}

# GoogleSearch performs the request and parsing; pagination() yields one
# parsed result page at a time until Google runs out of results.
search = GoogleSearch(params)

citations = []
for page in search.pagination():
    for organic_result in page["organic_results"]:
        # The citation count lives deep inside the rich snippet; chained
        # .get() calls with {} defaults keep missing levels from raising.
        rich_top = organic_result.get("rich_snippet", {}).get("top", {})
        citations.append({
            "title": organic_result.get("title"),
            "snippet": organic_result.get("snippet"),
            "cited_by": rich_top.get("detected_extensions", {}).get("cited_by"),
        })

print(json.dumps(citations, indent=2))
Output:
[
{
"title": "Targeting Bcl-2 in Herceptin-Resistant Breast Cancer Cell Lines",
"snippet": "recombinant humanized anti-HER2 monoclonal antibody approved for treatment of HER2-overexpressing metastatic breast cancer. Clinical studies have shown that ...",
"cited_by": 71
},
{
"title": "Estabilidad a largo plazo del trastuzumab en plasma y suero ...",
"snippet": "Multinational study of the efficacy and safety of humanized anti-HER2 monoclonal antibody in women who have HER2-overexpressing metastatic breast cancer ...",
"cited_by": 1
}
other results...
]
Hi Guys I’m trying to get the citations from a number of papers out of google. This is my code
import urllib
import mechanize
from bs4 import BeautifulSoup
import csv
import os #change directory
import re #for regular expressions
# Browser setup: ignore robots.txt and present a Firefox user agent so
# Google returns a normal HTML results page instead of blocking the bot.
br = mechanize.Browser()
br.set_handle_equiv(False)
br.set_handle_robots(False) # ignore robots
br.addheaders = [('User-agent', 'Firefox')] # [()]
br.open('http://google.com/')
br.select_form(name='f') # Note: select the form named 'f' here
# Paper title used as the search query, spaces replaced with '+'.
term = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease".replace(" ","+")
br.form['q'] = term # query
data = br.submit()
# NOTE(review): no parser argument is passed to BeautifulSoup, so bs4 will
# warn and pick whichever parser is installed — results can vary by machine.
soup = BeautifulSoup(data)
# 'f slp' is the CSS class that carries the "Cited by N" line on some Google
# result pages; the page returned by this form submit does not contain it,
# so cite ends up empty.
cite= soup.findAll('div',{'class': 'f slp'})
# NOTE(review): cite[1] raises IndexError whenever fewer than two matching
# divs are found — this is the error the question is about.
ref = str(cite[1])
print ref
However I keep getting errors. I want the number of citations this paper has.
The problem is that there is no citation info on the page you are getting after the form submit — in other words, there are no `div` elements with the `f slp` class.
You have several options to solve it:
- instead of mechanize, automate it in a real browser with selenium
- use google search api
See also:
- Google Search from a Python App
- GoogleScraper.py – A simple python module to parse google search results.
Hope that helps.
To get the citations from a number of papers out of Google, you can use a regular expression that extracts the "Cited by" text from the snippet:
snippet = result.select_one(".lEBKkf").text
cited_by = re.search(r'Cited by (\d+)', snippet).group()
In order to collect information from all pages, you need to use pagination with a while loop.
Pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector ".d6cvqb a[id=pnnext]", you need to increase the value of ["start"] by 10 to access the next page, if present, otherwise, we need to exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, re, lxml

# Paper title used as the Google search query.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,   # query
    "hl": "en",   # language
    "gl": "uk",   # country of the search, UK -> United Kingdom
    "start": 0,   # result offset; incremented by 10 per page for pagination
    # "num": 100  # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Compiled once, outside the loop. BUG FIX: the original pattern was
# r'Cited by (d+)' — a literal "d", which never matches a digit. The
# backslash in \d+ had been lost, so cited_by was always None.
CITED_BY_RE = re.compile(r"Cited by (\d+)")

page_num = 0
citations = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search",
                        params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text

        # The snippet element may be absent; select_one then returns None
        # and .text raises AttributeError. Catch only that — a bare except
        # would also hide real bugs (KeyboardInterrupt, typos, ...).
        try:
            snippet = result.select_one(".lEBKkf").text
        except AttributeError:
            snippet = None

        # Not every snippet carries a citation count; keep None when missing.
        match = CITED_BY_RE.search(snippet) if snippet else None
        cited_by = match.group() if match else None

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

    # Paginate while the "next" button exists; otherwise stop.
    if soup.select_one(".d6cvqb a[id=pnnext]"):
        params["start"] += 10
    else:
        break

print(json.dumps(citations, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Targeted therapeutic options and future perspectives for ...",
"snippet": "by J Wang · 2019 · Cited by 238 — Since its launch in 1998, trastuzumab became a therapeutic for breast cancer patients with HER2 overexpression and is widely administrated as ...",
"cited_by": "Cited by 238"
},
{
"title": "Trastuzumab Regimens for HER2-Overexpressing Metastatic ...",
"snippet": "by DR Spigel · 2003 · Cited by 30 — Multinational study of the efficacy and safety of humanized anti-HER2 ... breast cancer that has progressed after chemotherapy for metastatic disease.",
"cited_by": "Cited by 30"
},
other results...
]
Also one of the solutions is to use Google Search Engine Results API from SerpApi. It’s a paid API with the free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
import os, json

# Paper title used as the search query.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

params = {
    "api_key": "...",     # https://serpapi.com/manage-api-key
    "device": "desktop",  # device
    "engine": "google",   # serpapi parser engine
    "q": query,           # query
    "gl": "uk",           # country of the search, UK -> United Kingdom
    "hl": "en"            # language
}

# GoogleSearch performs the request and parsing; pagination() yields one
# parsed result page at a time until Google runs out of results.
search = GoogleSearch(params)

citations = []
for page in search.pagination():
    for organic_result in page["organic_results"]:
        # The citation count lives deep inside the rich snippet; chained
        # .get() calls with {} defaults keep missing levels from raising.
        rich_top = organic_result.get("rich_snippet", {}).get("top", {})
        citations.append({
            "title": organic_result.get("title"),
            "snippet": organic_result.get("snippet"),
            "cited_by": rich_top.get("detected_extensions", {}).get("cited_by"),
        })

print(json.dumps(citations, indent=2))
Output:
[
{
"title": "Targeting Bcl-2 in Herceptin-Resistant Breast Cancer Cell Lines",
"snippet": "recombinant humanized anti-HER2 monoclonal antibody approved for treatment of HER2-overexpressing metastatic breast cancer. Clinical studies have shown that ...",
"cited_by": 71
},
{
"title": "Estabilidad a largo plazo del trastuzumab en plasma y suero ...",
"snippet": "Multinational study of the efficacy and safety of humanized anti-HER2 monoclonal antibody in women who have HER2-overexpressing metastatic breast cancer ...",
"cited_by": 1
}
other results...
]