Getting a book's genres and language scraped from Goodreads using BeautifulSoup

Question

I was trying to scrape a book's metadata from Goodreads, but I can't get it right for the genres and language.
Here is the code:

import re
from bs4 import BeautifulSoup
from argparse import Namespace
from urllib.request import urlopen
import pandas as pd


def get_genres(soup):
    """Collect genre breadcrumbs from a parsed book page.

    For every ``<div class="left">`` in ``soup``, the texts of its
    ``<a class="actionLinkLite bookPageGenreLink">`` links are joined with
    ``' > '``. Divs whose joined text is empty or whitespace-only are skipped.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        A list of strings, one per div that yielded non-blank genre text.
    """
    link_attrs = {'class': 'actionLinkLite bookPageGenreLink'}
    # Lazily build one breadcrumb string per candidate container.
    breadcrumbs = (
        ' > '.join(link.text for link in container.find_all('a', link_attrs))
        for container in soup.find_all('div', {'class': 'left'})
    )
    return [crumb for crumb in breadcrumbs if crumb.strip()]




def scrape_book(book_id: str, args: Namespace):
    """Fetch the Goodreads page for ``book_id`` and return its metadata.

    Args:
        book_id: Path segment appended to
            ``https://www.goodreads.com/book/show/`` to form the page URL.
        args: Accepted for interface compatibility; not used by this function.

    Returns:
        A dict with the keys ``book_id_title``, ``book_id``, ``book_title``,
        ``book_description``, ``book_url``, ``Book_work_pages``,
        ``Book_work_published_date``, ``book_author``, ``book_author_url``,
        ``book_language`` and ``genres``.

    Raises:
        AttributeError: If one of the looked-up elements is absent from the
            page, because ``soup.find`` then returns ``None``.
    """
    url = "https://www.goodreads.com/book/show/" + book_id
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as source:
        soup = BeautifulSoup(source, "html.parser")

    book = {
        "book_id_title": book_id,
        # get_id is not defined in this snippet; it must be provided elsewhere.
        "book_id": get_id(book_id),
        "book_title": soup.find("h1", {"class": "Text Text__title1", "data-testid": "bookTitle"}).text,
        "book_description": soup.find("span", {"class": "Formatted"}).text,
        "book_url": url,
        "Book_work_pages": soup.find("p", {"data-testid": "pagesFormat"}).text,
        "Book_work_published_date": soup.find("p", {"data-testid": "publicationInfo"}).text,
        "book_author": soup.find("span", {"class": "ContributorLink__name"}).text,
        # The URL lives in the anchor's href attribute; .text would only give
        # the visible link text (the author's name).
        "book_author_url": soup.find("a", {"class": "ContributorLink"}).get("href"),
        # NOTE(review): this takes the text of the first "TruncatedContent"
        # div, which may not be the language field — confirm against the
        # page markup.
        "book_language": soup.find("div", {"class": "TruncatedContent"}).text,
        "genres": get_genres(soup),
    }

    return book

# Example run: `args` is annotated as a Namespace, so pass an instance rather
# than the Namespace class itself.
scrape_book(book_id='343', args=Namespace())

This is the book page used in the code above:
https://www.goodreads.com/book/show/343

Asked By: Mohammed Almulla

||

Source

Answer 1

You can try the CSS selector a[href*="/genres/"], which selects all <a> tags whose href= attribute contains /genres/:

def get_genres(soup):
    """Return the text of every link whose ``href`` contains ``/genres/``.

    Args:
        soup: Parsed page exposing ``select`` (e.g. a BeautifulSoup object).

    Returns:
        A list with the ``.text`` of each matching ``<a>`` element, in the
        order ``soup.select`` yields them.
    """
    return [anchor.text for anchor in soup.select('a[href*="/genres/"]')]

This will select these genres from the URL:

['Fiction', 'Classics', 'Historical Fiction', 'Horror', 'Mystery', 'Thriller', 'Crime']

Answered By: Andrej Kesely

Getting a book's genres and language scraped from Goodreads using BeautifulSoup

Question:

Answers: