I am web-scraping product titles and prices. In the output, the titles and prices are surrounded by extra characters (newlines, carriage returns and tabs). How do I remove them?
Question:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the listing page and parse the returned HTML.
response = requests.get("**website name**")
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the grid container, then collect every 'col-12' element inside it;
# each of those elements is treated as one product entry below.
listing = soup.find('div', {'class': 'row justify-content-md-first'})
product_cards = listing.find_all(class_='col-12')


def _field_text(card, css_class):
    """Return the text of the first element with *css_class* inside *card*.

    The text is returned exactly as get_text() yields it, i.e. without
    stripping any surrounding whitespace.
    """
    return card.find(class_=css_class).get_text()


product_titles = [_field_text(card, 'product_title') for card in product_cards]
product_prices = [_field_text(card, 'product_price') for card in product_cards]
print(product_titles)

# Assemble both lists into a two-column table, show it, and save it as CSV.
product_list = pd.DataFrame({
    'product_title': product_titles,
    'product_price': product_prices,
})
print(product_list)
product_list.to_csv('Product.csv')
product_title product_price
0 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Dot–To–Dot – ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
1 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Games and Puz... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
2 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Hidden Pictur... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
3 \n\r\n\t\t\t\t\t\t\t\t\tAmazing Mazes – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
4 \n\r\n\t\t\t\t\t\t\t\t\tEarly Maths – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
5 \n\r\n\t\t\t\t\t\t\t\t\tEarly Reading – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
6 \n\r\n\t\t\t\t\t\t\t\t\tLearning Centres – Sel... \r\n\t\t\t\t\t\t\t\tR210.00\r\n\t\t\t\t\t\t\t
7 \n\r\n\t\t\t\t\t\t\t\t\tLetters and Sounds – A... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
8 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Dot–To–Dot – A... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
9 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Fun – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
10 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Hidden Picture... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
11 \n\r\n\t\t\t\t\t\t\t\t\tPatterns and Sequence ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
12 \n\r\n\t\t\t\t\t\t\t\t\tTracing and Cutting – ... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
Answers:
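You can use pandas.Series.str.strip() to remove leading and trailing whitespace from every value.
By default it strips spaces, newlines, carriage returns and tabs, which are the extra characters surrounding your titles and prices.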
# Strip leading/trailing whitespace from the values of each column in turn;
# the .str accessor requires every column to hold strings.
product_list = product_list.apply(lambda column: column.str.strip(), axis=0)
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the listing page and parse the returned HTML.
response = requests.get("**website name**")
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the grid container, then collect every 'col-12' element inside it;
# each of those elements is treated as one product entry below.
listing = soup.find('div', {'class': 'row justify-content-md-first'})
product_cards = listing.find_all(class_='col-12')


def _field_text(card, css_class):
    """Return the text of the first element with *css_class* inside *card*.

    The text is returned exactly as get_text() yields it, i.e. without
    stripping any surrounding whitespace.
    """
    return card.find(class_=css_class).get_text()


product_titles = [_field_text(card, 'product_title') for card in product_cards]
product_prices = [_field_text(card, 'product_price') for card in product_cards]
print(product_titles)

# Assemble both lists into a two-column table, show it, and save it as CSV.
product_list = pd.DataFrame({
    'product_title': product_titles,
    'product_price': product_prices,
})
print(product_list)
product_list.to_csv('Product.csv')
product_title product_price
0 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Dot–To–Dot – ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
1 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Games and Puz... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
2 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Hidden Pictur... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
3 \n\r\n\t\t\t\t\t\t\t\t\tAmazing Mazes – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
4 \n\r\n\t\t\t\t\t\t\t\t\tEarly Maths – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
5 \n\r\n\t\t\t\t\t\t\t\t\tEarly Reading – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
6 \n\r\n\t\t\t\t\t\t\t\t\tLearning Centres – Sel... \r\n\t\t\t\t\t\t\t\tR210.00\r\n\t\t\t\t\t\t\t
7 \n\r\n\t\t\t\t\t\t\t\t\tLetters and Sounds – A... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
8 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Dot–To–Dot – A... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
9 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Fun – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
10 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Hidden Picture... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
11 \n\r\n\t\t\t\t\t\t\t\t\tPatterns and Sequence ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
12 \n\r\n\t\t\t\t\t\t\t\t\tTracing and Cutting – ... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
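You can use pandas.Series.str.strip() to remove leading and trailing whitespace from every value.
By default it strips spaces, newlines, carriage returns and tabs, which are the extra characters surrounding your titles and prices.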
# Strip leading/trailing whitespace from the values of each column in turn;
# the .str accessor requires every column to hold strings.
product_list = product_list.apply(lambda column: column.str.strip(), axis=0)