I want to store an image URL in a CSV (Excel) sheet, but I get a data:image/ URI instead
Question:
I want to store the image URL in a CSV (Excel) sheet, but the spider gives me this "data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
instead of the image URL.
# Spider as posted in the question: it prompts for an article URL, extracts the
# feature image attribute, and appends one row to a CSV file.
# NOTE(review): indentation was lost in this excerpt; the nesting shown is not runnable as-is.
class NewsSpider(scrapy.Spider):
name = "articles"
def start_requests(self):
url = input("Enter the article url: ")
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
# `img//@src` matches the src attribute of the img element and of any node below it;
# .get() returns the first match.
Feature_Image =response.xpath('//*[@id="article-wrapper"]/article/section[2]/div/div/div/img//@src').get()
# Resolve the extracted value against the response URL.
Feature_Image = response.urljoin(Feature_Image)
# NOTE(review): Published_Date and Content are not assigned anywhere in this excerpt.
yield{
'Publication Date': Published_Date,
'Feature_Image': Feature_Image,
'Article Content': Content
}
# =============== Data Store +++++++++++++++++++++
# NOTE(review): Category, Headlines, Author and Source are not assigned in this excerpt;
# presumably this section sits inside parse_dir_contents - confirm against the full file.
Data = [[Category,Headlines,Author,Source,Published_Date,Feature_Image,Content,url]]
try:
# Build a one-row DataFrame and append it to the CSV without a header row.
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
with open('C:/Users/Public/pagedata.csv', 'a') as f:
df.to_csv(f, header=False)
except:
# Bare except: any failure above falls through to a second append attempt.
# No header= argument is passed here, unlike the try branch.
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
df.to_csv('C:/Users/Public/pagedata.csv', mode='a')
Answers:
-
The image URL is already an absolute URL, so there is no need to make it absolute again with the urljoin()
method; that call does not help you grab the original image URL.
-
Your XPath expression selects only a single image, so remove the extra forward slash before @src (use /@src instead of //@src).
-
You aren’t getting the right image URL because @src selects the placeholder value shown in your output; the original image URL is stored in the @data-src attribute.
Try:
import scrapy
class NewsSpider(scrapy.Spider):
    """Spider that extracts the feature image URL from a single article page."""

    name = "articles"

    def start_requests(self):
        """Prompt for an article URL and schedule it for parsing.

        Yields:
            A ``scrapy.Request`` whose callback is ``parse_dir_contents``.
        """
        # Example input:
        # https://skift.com/2022/10/08/american-express-travels-rebound-and-other-top-stories-this-week/
        # Strip surrounding whitespace that may come along with a pasted URL.
        url = input("Enter the article url: ").strip()
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract the feature image URL from the article response.

        Args:
            response: The response for the requested article page.

        Yields:
            A dict with the key ``'Feature_Image'``; its value is the first
            ``@data-src`` match, or ``None`` when nothing matches.
        """
        # On this page @src holds a base64 placeholder; the original image URL
        # is in @data-src. It is already absolute, so urljoin() is not needed.
        feature_image = response.xpath(
            '//*[@id="article-wrapper"]/article/section[2]/div/div/div/img/@data-src'
        ).get()
        # The question's other fields ('Publication Date', 'Article Content')
        # are left out of this minimal example.
        yield {
            'Feature_Image': feature_image,
        }
Output:
{'Feature_Image': 'https://skift.com/wp-content/uploads/2022/10/American_Express_office_in_Rome-1-e1665181357253-1024x682.jpg'}
I want to store the image URL in a CSV (Excel) sheet, but the spider gives me this "data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
instead of the image URL.
# Spider as posted in the question: it prompts for an article URL, extracts the
# feature image attribute, and appends one row to a CSV file.
# NOTE(review): indentation was lost in this excerpt; the nesting shown is not runnable as-is.
class NewsSpider(scrapy.Spider):
name = "articles"
def start_requests(self):
url = input("Enter the article url: ")
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
# `img//@src` matches the src attribute of the img element and of any node below it;
# .get() returns the first match.
Feature_Image =response.xpath('//*[@id="article-wrapper"]/article/section[2]/div/div/div/img//@src').get()
# Resolve the extracted value against the response URL.
Feature_Image = response.urljoin(Feature_Image)
# NOTE(review): Published_Date and Content are not assigned anywhere in this excerpt.
yield{
'Publication Date': Published_Date,
'Feature_Image': Feature_Image,
'Article Content': Content
}
# =============== Data Store +++++++++++++++++++++
# NOTE(review): Category, Headlines, Author and Source are not assigned in this excerpt;
# presumably this section sits inside parse_dir_contents - confirm against the full file.
Data = [[Category,Headlines,Author,Source,Published_Date,Feature_Image,Content,url]]
try:
# Build a one-row DataFrame and append it to the CSV without a header row.
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
with open('C:/Users/Public/pagedata.csv', 'a') as f:
df.to_csv(f, header=False)
except:
# Bare except: any failure above falls through to a second append attempt.
# No header= argument is passed here, unlike the try branch.
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
df.to_csv('C:/Users/Public/pagedata.csv', mode='a')
-
The image URL is already an absolute URL, so there is no need to make it absolute again with the urljoin() method; that call does not help you grab the original image URL.
-
Your XPath expression selects only a single image, so remove the extra forward slash before @src (use /@src instead of //@src).
-
You aren’t getting the right image URL because @src selects the placeholder value shown in your output; the original image URL is stored in the @data-src attribute.
Try:
import scrapy
class NewsSpider(scrapy.Spider):
    """Spider that extracts the feature image URL from a single article page."""

    name = "articles"

    def start_requests(self):
        """Prompt for an article URL and schedule it for parsing.

        Yields:
            A ``scrapy.Request`` whose callback is ``parse_dir_contents``.
        """
        # Example input:
        # https://skift.com/2022/10/08/american-express-travels-rebound-and-other-top-stories-this-week/
        # Strip surrounding whitespace that may come along with a pasted URL.
        url = input("Enter the article url: ").strip()
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract the feature image URL from the article response.

        Args:
            response: The response for the requested article page.

        Yields:
            A dict with the key ``'Feature_Image'``; its value is the first
            ``@data-src`` match, or ``None`` when nothing matches.
        """
        # On this page @src holds a base64 placeholder; the original image URL
        # is in @data-src. It is already absolute, so urljoin() is not needed.
        feature_image = response.xpath(
            '//*[@id="article-wrapper"]/article/section[2]/div/div/div/img/@data-src'
        ).get()
        # The question's other fields ('Publication Date', 'Article Content')
        # are left out of this minimal example.
        yield {
            'Feature_Image': feature_image,
        }
Output:
{'Feature_Image': 'https://skift.com/wp-content/uploads/2022/10/American_Express_office_in_Rome-1-e1665181357253-1024x682.jpg'}