Getting a book's genres and language scraped from Goodreads using BeautifulSoup
Question:
I was trying to scrape a book's metadata from Goodreads, but I can't get the genres and language right.
Here is the code:
import re
from bs4 import BeautifulSoup
from argparse import Namespace
from urllib.request import urlopen
import pandas as pd
def get_genres(soup):
    """Collect genre paths from the ``div.left`` containers of a parsed page.

    Args:
        soup: Parsed page object exposing ``find_all`` (e.g. a BeautifulSoup
            document).

    Returns:
        A list of strings, one per ``div`` with class ``left`` that contains
        at least one ``a.actionLinkLite.bookPageGenreLink`` link with
        non-blank text. The link texts inside one container are joined
        with ``' > '``.
    """
    genres = []
    for node in soup.find_all("div", {"class": "left"}):
        links = node.find_all("a", {"class": "actionLinkLite bookPageGenreLink"})
        path = " > ".join(link.text for link in links)
        # Containers without genre links produce an empty string; skip them.
        if path.strip():
            genres.append(path)
    return genres
def scrape_book(book_id: str, args: Namespace):
    """Fetch a Goodreads book page and collect its metadata into a dict.

    Args:
        book_id: Identifier appended to ``https://www.goodreads.com/book/show/``.
        args: Parsed command-line options; not read by this function.

    Returns:
        A dict with the keys ``book_id_title``, ``book_id``, ``book_title``,
        ``book_description``, ``book_url``, ``Book_work_pages``,
        ``Book_work_published_date``, ``book_author``, ``book_author_url``,
        ``book_language`` and ``genres``.

    Raises:
        AttributeError, TypeError: If an expected element is absent from the
            page; the result of each ``soup.find`` is used without a
            ``None`` check.
    """
    url = "https://www.goodreads.com/book/show/" + book_id
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as source:
        soup = BeautifulSoup(source, "html.parser")
    book = {
        "book_id_title": book_id,
        "book_id": get_id(book_id),
        "book_title": soup.find("h1", {"class": "Text Text__title1", "data-testid": "bookTitle"}).text,
        "book_description": soup.find("span", {"class": "Formatted"}).text,
        "book_url": url,
        "Book_work_pages": soup.find("p", {"data-testid": "pagesFormat"}).text,
        "Book_work_published_date": soup.find("p", {"data-testid": "publicationInfo"}).text,
        "book_author": soup.find("span", {"class": "ContributorLink__name"}).text,
        # The URL is the link target, not the anchor's visible text.
        "book_author_url": soup.find("a", {"class": "ContributorLink"})["href"],
        # NOTE(review): this takes the first ``div.TruncatedContent`` on the
        # page, which is presumably not the language field -- confirm the
        # right selector against the live markup.
        "book_language": soup.find("div", {"class": "TruncatedContent"}).text,
        "genres": get_genres(soup),
    }
    return book
# Example call for book 343; pass a Namespace instance, not the class itself.
scrape_book(book_id="343", args=Namespace())
This is the book page used in the code above:
https://www.goodreads.com/book/show/343
Answers:
You can try the CSS selector `a[href*="/genres/"]`. This selects all `<a>` tags containing `/genres/` in the `href=` attribute:
def get_genres(soup):
    """Return the text of every link whose ``href`` contains ``/genres/``.

    Args:
        soup: Parsed page object exposing ``select`` (e.g. a BeautifulSoup
            document).

    Returns:
        A list with the ``.text`` of each matching ``<a>`` element, in the
        order ``select`` yields them.
    """
    return [a.text for a in soup.select('a[href*="/genres/"]')]
This will select these genres from the URL:
['Fiction', 'Classics', 'Historical Fiction', 'Horror', 'Mystery', 'Thriller', 'Crime']
I was trying to scrape a book's metadata from Goodreads, but I can't get the genres and language right.
Here is the code:
import re
from bs4 import BeautifulSoup
from argparse import Namespace
from urllib.request import urlopen
import pandas as pd
def get_genres(soup):
    """Collect genre paths from the ``div.left`` containers of a parsed page.

    Args:
        soup: Parsed page object exposing ``find_all`` (e.g. a BeautifulSoup
            document).

    Returns:
        A list of strings, one per ``div`` with class ``left`` that contains
        at least one ``a.actionLinkLite.bookPageGenreLink`` link with
        non-blank text. The link texts inside one container are joined
        with ``' > '``.
    """
    genres = []
    for node in soup.find_all("div", {"class": "left"}):
        links = node.find_all("a", {"class": "actionLinkLite bookPageGenreLink"})
        path = " > ".join(link.text for link in links)
        # Containers without genre links produce an empty string; skip them.
        if path.strip():
            genres.append(path)
    return genres
def scrape_book(book_id: str, args: Namespace):
    """Fetch a Goodreads book page and collect its metadata into a dict.

    Args:
        book_id: Identifier appended to ``https://www.goodreads.com/book/show/``.
        args: Parsed command-line options; not read by this function.

    Returns:
        A dict with the keys ``book_id_title``, ``book_id``, ``book_title``,
        ``book_description``, ``book_url``, ``Book_work_pages``,
        ``Book_work_published_date``, ``book_author``, ``book_author_url``,
        ``book_language`` and ``genres``.

    Raises:
        AttributeError, TypeError: If an expected element is absent from the
            page; the result of each ``soup.find`` is used without a
            ``None`` check.
    """
    url = "https://www.goodreads.com/book/show/" + book_id
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as source:
        soup = BeautifulSoup(source, "html.parser")
    book = {
        "book_id_title": book_id,
        "book_id": get_id(book_id),
        "book_title": soup.find("h1", {"class": "Text Text__title1", "data-testid": "bookTitle"}).text,
        "book_description": soup.find("span", {"class": "Formatted"}).text,
        "book_url": url,
        "Book_work_pages": soup.find("p", {"data-testid": "pagesFormat"}).text,
        "Book_work_published_date": soup.find("p", {"data-testid": "publicationInfo"}).text,
        "book_author": soup.find("span", {"class": "ContributorLink__name"}).text,
        # The URL is the link target, not the anchor's visible text.
        "book_author_url": soup.find("a", {"class": "ContributorLink"})["href"],
        # NOTE(review): this takes the first ``div.TruncatedContent`` on the
        # page, which is presumably not the language field -- confirm the
        # right selector against the live markup.
        "book_language": soup.find("div", {"class": "TruncatedContent"}).text,
        "genres": get_genres(soup),
    }
    return book
# Example call for book 343; pass a Namespace instance, not the class itself.
scrape_book(book_id="343", args=Namespace())
This is the book page used in the code above:
https://www.goodreads.com/book/show/343
You can try the CSS selector `a[href*="/genres/"]`. This selects all `<a>` tags containing `/genres/` in the `href=` attribute:
def get_genres(soup):
    """Return the text of every link whose ``href`` contains ``/genres/``.

    Args:
        soup: Parsed page object exposing ``select`` (e.g. a BeautifulSoup
            document).

    Returns:
        A list with the ``.text`` of each matching ``<a>`` element, in the
        order ``select`` yields them.
    """
    return [a.text for a in soup.select('a[href*="/genres/"]')]
This will select these genres from the URL:
['Fiction', 'Classics', 'Historical Fiction', 'Horror', 'Mystery', 'Thriller', 'Crime']