I'm only getting the latest date as dataframe, rather than all…?
Question:
I’m using the below code to scrape several tables across multiple dates. The code works well, but the output when saving to CSV only stores the latest data as a dataframe.
What am I missing here in order to union / concatenate all these scraped dates into one dataframe?
##### import packages #######
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
#############################
# Scrape the bestseller cards for every (year, week) URL built below and write
# the collected result to "top60.csv".
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting of the loops and branches has to be inferred -- confirm against
# the original script before relying on any note below.
years = ["2023"]
weeks = list(range(1, 5)) # Weeks should probably range from 1 to 53, since rarely there can be 53 instead of 52 weeks in a year
# One URL per year/week combination; {week:02} zero-pads the week to two digits.
urls = [f"https://www.debestseller60.nl/{year}{week:02}#top" for year in years for week in weeks]
### set user-agent ####
## response = requests.get(url,headers={'user-agent':'Mozilla/5.0'})
for url in urls:
response = requests.get(url)
# Only pages answered with HTTP 200 are parsed; the `else` further down
# prints the URL that could not be fetched.
if response.status_code == 200:
soup = BeautifulSoup(response.content, "html.parser")
# Your scraping logic here
# NOTE(review): presumably this sits inside the per-URL loop, in which case the
# list is re-created for every URL and rows gathered for earlier weeks are lost.
data = []
cards = soup.find_all("div", class_="card")
for book in cards:
# Each field calls find() twice: once as the existence check and once to read
# the stripped text; the value is None when the element is missing.
author = book.find("div", class_="card__author").text.strip() if book.find("div", class_="card__author") else None
title = book.find("div", class_="card__title heading-2 mb-2 clickable").text.strip() if book.find("div", class_="card__title heading-2 mb-2 clickable") else None
Hebban_link = book.find("y-use", class_="History").text.strip() if book.find("y-use", class_="History") else None
# NOTE(review): this is a second pass over the same `cards` list. If the row is
# appended only after this loop has finished, `tag` holds the value of the last
# card regardless of which `book` is being processed -- confirm this is intended.
for card in cards:
tags_div = card.find('div', class_='card__tags')
if tags_div is not None:
tags = tags_div.find_all('div', class_='card__tags__tag')
# With four or more tags the second-to-last one is used, otherwise the last.
# An empty `tags` list would make tags[-1] raise IndexError.
if len(tags) >= 4:
tag = tags[-2].text.strip()
else:
tag = tags[-1].text.strip()
else:
tag = "No tags found for this card"
# find the div element with the class "weeks__week week week--active"
active_week_div = soup.find('div', class_='weeks__week week week--active')
# print the text content of the div element
# (if no such div is found, `.text` on the None result raises AttributeError)
print(active_week_div.text.strip())
#append data
data.append({"Author": author, "Title": title, "Hebban_link": Hebban_link,"ISBN": tag, "week_year":active_week_div.text.strip()})
#convert to dataframe
df = pd.DataFrame(data)
## add rank
# rank is the 1-based row position (df.index + 1)
df["rank"] = df.index + 1
print(df)
# NOTE(review): this appends the DataFrame to the same list that already holds
# the row dicts, so `data` ends up mixing dicts and a DataFrame.
data.append(df)
else:
print(f"Failed to fetch data for URL: {url}")
#converted a file to csv
# NOTE(review): pd.concat is handed whatever `data` holds at this point; it
# expects pandas objects, not row dicts -- verify what the list contains here.
pd.concat(data).to_csv("top60.csv", encoding='utf-8', index=False)
Answers:
You are overwriting `df` at each iteration of your loop, so only the DataFrame of the last scraped week is left when you save.
You should append `df` to a list at each iteration:
# Sketch of the fix: keep one DataFrame per scraped URL in `dfs` instead of
# letting each iteration replace the previous result. The `# ...` lines stand
# for the unchanged request/parsing code from the question.
dfs = []
for url in urls:
    # ... (fetch the page, build `soup` and `cards`)
    data = []  # rows of this URL only; resetting it per URL is fine now
    for book in cards:
        # ... (extract author, title, Hebban_link, tag, active_week_div)
        data.append({"Author": author, "Title": title, "Hebban_link": Hebban_link,"ISBN": tag, "week_year":active_week_div.text.strip()})
    # Runs once per URL, after all of its rows have been gathered: store the
    # finished week so that the next iteration cannot overwrite it.
    dfs.append(pd.DataFrame(data))
And in the end, `concat` all the DataFrames:
# pd.concat raises ValueError when given an empty list (e.g. every request
# failed), so the CSV is only written when at least one week was scraped.
if dfs:
    pd.concat(dfs).to_csv("top60.csv", encoding='utf-8', index=False)
else:
    print("No data was scraped; top60.csv was not written.")
Output (as DataFrame):
Author Title Hebban_link ISBN week_year
0 Dirk de Wachter Vertroostingen None ISBN 9789026346897 week 1 - 2023
1 Charlotte Labee Overprikkeld brein None ISBN 9789026346897 week 1 - 2023
2 Israel van Dorsten Wij waren, ik ben None ISBN 9789026346897 week 1 - 2023
3 Delia Owens Daar waar de rivierkreeften zingen None ISBN 9789026346897 week 1 - 2023
4 Kluun Help, ik heb een puber! None ISBN 9789026346897 week 1 - 2023
.. ... ... ... ... ...
55 Raynor Winn Landlijnen None ISBN 9789043924009 week 4 - 2023
56 Michael Ende Momo en de tijdspaarders None ISBN 9789043924009 week 4 - 2023
57 Arjen van Veelen Rotterdam None ISBN 9789043924009 week 4 - 2023
58 Jutta Chorus Alma's dochters None ISBN 9789043924009 week 4 - 2023
59 Jamie Oliver Jamie Oliver - EEN None ISBN 9789043924009 week 4 - 2023
[240 rows x 5 columns]
I’m using the below code to scrape several tables across multiple dates. The code works well, but the output when saving to CSV only stores the latest data as a dataframe.
What am I missing here in order to union / concatenate all these scraped dates into one dataframe?
##### import packages #######
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
#############################
# Scrape the bestseller cards for every (year, week) URL built below and write
# the collected result to "top60.csv".
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting of the loops and branches has to be inferred -- confirm against
# the original script before relying on any note below.
years = ["2023"]
weeks = list(range(1, 5)) # Weeks should probably range from 1 to 53, since rarely there can be 53 instead of 52 weeks in a year
# One URL per year/week combination; {week:02} zero-pads the week to two digits.
urls = [f"https://www.debestseller60.nl/{year}{week:02}#top" for year in years for week in weeks]
### set user-agent ####
## response = requests.get(url,headers={'user-agent':'Mozilla/5.0'})
for url in urls:
response = requests.get(url)
# Only pages answered with HTTP 200 are parsed; the `else` further down
# prints the URL that could not be fetched.
if response.status_code == 200:
soup = BeautifulSoup(response.content, "html.parser")
# Your scraping logic here
# NOTE(review): presumably this sits inside the per-URL loop, in which case the
# list is re-created for every URL and rows gathered for earlier weeks are lost.
data = []
cards = soup.find_all("div", class_="card")
for book in cards:
# Each field calls find() twice: once as the existence check and once to read
# the stripped text; the value is None when the element is missing.
author = book.find("div", class_="card__author").text.strip() if book.find("div", class_="card__author") else None
title = book.find("div", class_="card__title heading-2 mb-2 clickable").text.strip() if book.find("div", class_="card__title heading-2 mb-2 clickable") else None
Hebban_link = book.find("y-use", class_="History").text.strip() if book.find("y-use", class_="History") else None
# NOTE(review): this is a second pass over the same `cards` list. If the row is
# appended only after this loop has finished, `tag` holds the value of the last
# card regardless of which `book` is being processed -- confirm this is intended.
for card in cards:
tags_div = card.find('div', class_='card__tags')
if tags_div is not None:
tags = tags_div.find_all('div', class_='card__tags__tag')
# With four or more tags the second-to-last one is used, otherwise the last.
# An empty `tags` list would make tags[-1] raise IndexError.
if len(tags) >= 4:
tag = tags[-2].text.strip()
else:
tag = tags[-1].text.strip()
else:
tag = "No tags found for this card"
# find the div element with the class "weeks__week week week--active"
active_week_div = soup.find('div', class_='weeks__week week week--active')
# print the text content of the div element
# (if no such div is found, `.text` on the None result raises AttributeError)
print(active_week_div.text.strip())
#append data
data.append({"Author": author, "Title": title, "Hebban_link": Hebban_link,"ISBN": tag, "week_year":active_week_div.text.strip()})
#convert to dataframe
df = pd.DataFrame(data)
## add rank
# rank is the 1-based row position (df.index + 1)
df["rank"] = df.index + 1
print(df)
# NOTE(review): this appends the DataFrame to the same list that already holds
# the row dicts, so `data` ends up mixing dicts and a DataFrame.
data.append(df)
else:
print(f"Failed to fetch data for URL: {url}")
#converted a file to csv
# NOTE(review): pd.concat is handed whatever `data` holds at this point; it
# expects pandas objects, not row dicts -- verify what the list contains here.
pd.concat(data).to_csv("top60.csv", encoding='utf-8', index=False)
You are overwriting `df` at each iteration of your loop, so only the DataFrame of the last scraped week is left when you save.
You should append `df` to a list at each iteration:
# Sketch of the fix: keep one DataFrame per scraped URL in `dfs` instead of
# letting each iteration replace the previous result. The `# ...` lines stand
# for the unchanged request/parsing code from the question.
dfs = []
for url in urls:
    # ... (fetch the page, build `soup` and `cards`)
    data = []  # rows of this URL only; resetting it per URL is fine now
    for book in cards:
        # ... (extract author, title, Hebban_link, tag, active_week_div)
        data.append({"Author": author, "Title": title, "Hebban_link": Hebban_link,"ISBN": tag, "week_year":active_week_div.text.strip()})
    # Runs once per URL, after all of its rows have been gathered: store the
    # finished week so that the next iteration cannot overwrite it.
    dfs.append(pd.DataFrame(data))
And in the end, `concat` all the DataFrames:
# pd.concat raises ValueError when given an empty list (e.g. every request
# failed), so the CSV is only written when at least one week was scraped.
if dfs:
    pd.concat(dfs).to_csv("top60.csv", encoding='utf-8', index=False)
else:
    print("No data was scraped; top60.csv was not written.")
Output (as DataFrame):
Author Title Hebban_link ISBN week_year
0 Dirk de Wachter Vertroostingen None ISBN 9789026346897 week 1 - 2023
1 Charlotte Labee Overprikkeld brein None ISBN 9789026346897 week 1 - 2023
2 Israel van Dorsten Wij waren, ik ben None ISBN 9789026346897 week 1 - 2023
3 Delia Owens Daar waar de rivierkreeften zingen None ISBN 9789026346897 week 1 - 2023
4 Kluun Help, ik heb een puber! None ISBN 9789026346897 week 1 - 2023
.. ... ... ... ... ...
55 Raynor Winn Landlijnen None ISBN 9789043924009 week 4 - 2023
56 Michael Ende Momo en de tijdspaarders None ISBN 9789043924009 week 4 - 2023
57 Arjen van Veelen Rotterdam None ISBN 9789043924009 week 4 - 2023
58 Jutta Chorus Alma's dochters None ISBN 9789043924009 week 4 - 2023
59 Jamie Oliver Jamie Oliver - EEN None ISBN 9789043924009 week 4 - 2023
[240 rows x 5 columns]