Unable to select the correct div from a webpage
Question:
I am trying to parse song titles from a website, but can’t figure out how to grab the specific div that has them. I’ve tried about a dozen different methods but always get back an empty list.
If you go to the url and inspect one of the youtube videos there, you will find a div with a class of single-post-oembed-youtube-wrapper. That element also contains the artist and title of the song.
This is my first time attempting to scrape data from a webpage, can someone help me out?
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pprint
from webdriver_manager.chrome import ChromeDriverManager
import sys

# Load the article with Selenium (the YouTube embeds are injected by
# JavaScript) and parse the rendered page with BeautifulSoup.
html = None
url = 'https://ultimateclassicrock.com/best-rock-songs-2018/'
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver")
browser.get(url)
# Wait until at least one video wrapper exists before grabbing page_source;
# parsing immediately after get() can race the JS and return an empty list.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(
        (By.CLASS_NAME, "single-post-oembed-youtube-wrapper")
    )
)
soup = BeautifulSoup(browser.page_source, 'html.parser')
# BUG FIX: the class string had a stray trailing apostrophe
# ("...wrapper'"), so find_all could never match any element.
divs = soup.find_all("div", {"class": "single-post-oembed-youtube-wrapper"})
pprint.pprint(divs)
# quit() tears down the whole driver session; close() only closes the
# current window and leaves the chromedriver process running.
browser.quit()
Answers:
Please try this:
soup = BeautifulSoup(browser.page_source, 'html.parser')
# BUG FIX: find_all() interprets its first argument as a tag NAME, so a
# CSS selector string always returns []. CSS selectors go through select().
titles = soup.select(".single-post-oembed-youtube-wrapper + div p strong")
This will give you all the titles there
You can also try to retrieve the data directly from the HTML source, thus avoiding Selenium.
import requests
from bs4 import BeautifulSoup
# BUG FIX: the code below uses the conventional `pd` alias, but the module
# was imported as plain `pandas`, raising NameError at the DataFrame call.
import pandas as pd

# Fetch the static HTML directly; the "Artist, 'Title'" captions are plain
# <strong> tags in the source, so no browser automation is required.
url = "https://ultimateclassicrock.com/best-rock-songs-2018/"
res = requests.get(url)
# Name the parser explicitly to avoid bs4's "no parser specified" warning
# and parser-dependent results across machines.
soup = BeautifulSoup(res.content, "html.parser")

results = []
for elem in soup.find_all("strong"):
    if "," in elem.text:
        # maxsplit=1 keeps any ", " inside a song title in the second
        # field; otherwise a 3-element row would break the 2-column frame.
        results.append(elem.text.split(", ", 1))

df = pd.DataFrame(results, columns=["artist", "song"])
df
Output:
artist song
0 Steve Perry 'Sun Shines Gray'
1 Paul McCartney 'I Don't Know'
2 Judas Priest 'Flamethrower'
3 Ace Frehley 'Rocking With the Boys'
4 Paul Simon 'Questions for the Angels'
...
This is slightly hacky but works with your example.
I don’t think a div with that class exists in the page’s raw HTML source.
You can grab all the required data from API
import requests

# Pull the article metadata straight from the site's JSON API, skipping
# HTML parsing entirely. A browser-like User-Agent avoids a bot block.
api_url = 'https://ultimateclassicrock.com/rest/carbon/api/menu/category/album-reviews/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
data = []
res = requests.get(api_url, headers=headers)
for item in res.json()['widgets']['dataDetails'].values():
    title = item['data']['mainData']['title']
    # BUG FIX: `title` was assigned but never stored or printed, so `data`
    # stayed empty and the loop had no visible effect.
    data.append(title)
    print(title)
Output:
Reissue Roundup: Summer Sets From Blondie, Lou Reed and More
Todd Rundgren, 'Space Force': Album Review
Pink Floyd, 'Animals (2018 Remix)': Album Review
Sammy Hagar and the Circle, 'Crazy Times': Album Review
Ringo Starr, 'EP3': Album Review
Billy Idol, 'The Cage EP': Album Review
Beatles, 'Revolver Special Edition (Super Deluxe)': Album Review
Richard Marx, 'Songwriter': Album Review
The Cult, 'Under the Midnight Sun': Album Review
Various, 'Here It Is: A Tribute to Leonard Cohen': Album Review
Red Hot Chili Peppers, 'Return of the Dream Canteen': Review
Skid Row, 'The Gang's All Here': Album Review
I am trying to parse song titles from a website, but can’t figure out how to grab the specific div that has them. I’ve tried about a dozen different methods but always get back an empty list.
If you go to the url and inspect one of the youtube videos there, you will find a div with a class of single-post-oembed-youtube-wrapper. That element also contains the artist and title of the song.
This is my first time attempting to scrape data from a webpage, can someone help me out?
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pprint
from webdriver_manager.chrome import ChromeDriverManager
import sys

# Load the article with Selenium (the YouTube embeds are injected by
# JavaScript) and parse the rendered page with BeautifulSoup.
html = None
url = 'https://ultimateclassicrock.com/best-rock-songs-2018/'
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver")
browser.get(url)
# Wait until at least one video wrapper exists before grabbing page_source;
# parsing immediately after get() can race the JS and return an empty list.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(
        (By.CLASS_NAME, "single-post-oembed-youtube-wrapper")
    )
)
soup = BeautifulSoup(browser.page_source, 'html.parser')
# BUG FIX: the class string had a stray trailing apostrophe
# ("...wrapper'"), so find_all could never match any element.
divs = soup.find_all("div", {"class": "single-post-oembed-youtube-wrapper"})
pprint.pprint(divs)
# quit() tears down the whole driver session; close() only closes the
# current window and leaves the chromedriver process running.
browser.quit()
Please try this:
soup = BeautifulSoup(browser.page_source, 'html.parser')
# BUG FIX: find_all() interprets its first argument as a tag NAME, so a
# CSS selector string always returns []. CSS selectors go through select().
titles = soup.select(".single-post-oembed-youtube-wrapper + div p strong")
This will give you all the titles there
You can also try to retrieve the data directly from the HTML source, thus avoiding Selenium.
import requests
from bs4 import BeautifulSoup
# BUG FIX: the code below uses the conventional `pd` alias, but the module
# was imported as plain `pandas`, raising NameError at the DataFrame call.
import pandas as pd

# Fetch the static HTML directly; the "Artist, 'Title'" captions are plain
# <strong> tags in the source, so no browser automation is required.
url = "https://ultimateclassicrock.com/best-rock-songs-2018/"
res = requests.get(url)
# Name the parser explicitly to avoid bs4's "no parser specified" warning
# and parser-dependent results across machines.
soup = BeautifulSoup(res.content, "html.parser")

results = []
for elem in soup.find_all("strong"):
    if "," in elem.text:
        # maxsplit=1 keeps any ", " inside a song title in the second
        # field; otherwise a 3-element row would break the 2-column frame.
        results.append(elem.text.split(", ", 1))

df = pd.DataFrame(results, columns=["artist", "song"])
df
Output:
artist song
0 Steve Perry 'Sun Shines Gray'
1 Paul McCartney 'I Don't Know'
2 Judas Priest 'Flamethrower'
3 Ace Frehley 'Rocking With the Boys'
4 Paul Simon 'Questions for the Angels'
...
This is slightly hacky but works with your example.
I don’t think a div with that class exists in the page’s raw HTML source.
You can grab all the required data from API
import requests

# Pull the article metadata straight from the site's JSON API, skipping
# HTML parsing entirely. A browser-like User-Agent avoids a bot block.
api_url = 'https://ultimateclassicrock.com/rest/carbon/api/menu/category/album-reviews/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
data = []
res = requests.get(api_url, headers=headers)
for item in res.json()['widgets']['dataDetails'].values():
    title = item['data']['mainData']['title']
    # BUG FIX: `title` was assigned but never stored or printed, so `data`
    # stayed empty and the loop had no visible effect.
    data.append(title)
    print(title)
Output:
Reissue Roundup: Summer Sets From Blondie, Lou Reed and More
Todd Rundgren, 'Space Force': Album Review
Pink Floyd, 'Animals (2018 Remix)': Album Review
Sammy Hagar and the Circle, 'Crazy Times': Album Review
Ringo Starr, 'EP3': Album Review
Billy Idol, 'The Cage EP': Album Review
Beatles, 'Revolver Special Edition (Super Deluxe)': Album Review
Richard Marx, 'Songwriter': Album Review
The Cult, 'Under the Midnight Sun': Album Review
Various, 'Here It Is: A Tribute to Leonard Cohen': Album Review
Red Hot Chili Peppers, 'Return of the Dream Canteen': Review
Skid Row, 'The Gang's All Here': Album Review