Unable to scrape items using scrapy [solved]

Question:

I am trying to webscrape the name, price, and description of products listed on an online shop. The website link is https://eshop.nomin.mn/n-foods.html

enter image description here

When I look through the HTML of the page, I can see the relevant div class containers, but when I reference them in my code as shown below, I get no values when I run my spider. One possible reason is that the website is JavaScript-based and dynamic, which would require me to use Splash. However, I don’t think this is the case for my issue.

def parse(self, response, **kwargs):
    """Select every product card in the response and read one text field from each.

    This is an excerpt: the extracted value is assigned to ``price`` but not
    yielded here.
    """
    cards = response.xpath('//div[@class="item-itemmainroot-1lZ"]')

    # parse details
    for card in cards:
        # ``span`` is an element step, not a function call: the original
        # ``span()`` is not valid XPath 1.0 and fails as soon as a card matches.
        price = card.xpath(".//a[contains(@class, 'item-nameLenght-K5Z item-name-3TH')]/span/text()").extract()

Full Code:

import scrapy
import re


class TempSpider(scrapy.Spider):
    """Spider that reads product cards from the n-foods category page."""

    name = 'temp_spider'
    # allowed_domains takes bare domain names; a full URL entry is ignored
    # by Scrapy's offsite filtering (with a warning).
    allowed_domains = ['eshop.nomin.mn']
    start_urls = ['https://eshop.nomin.mn/n-foods.html']

    def parse(self, response, **kwargs):
        """Yield one ``{'price': [...]}`` dict per matched product card.

        NOTE(review): if the listing is rendered client-side, the card XPath
        matches nothing in the raw response -- verify with ``scrapy shell``.
        """
        cards = response.xpath('//div[@class="item-itemmainroot-1lZ"]')

        # parse details
        for card in cards:
            # ``span`` is an element step; the original ``span()`` is not
            # valid XPath 1.0 and raises once a card is matched.
            # NOTE(review): the class names suggest this anchor holds the
            # product name rather than the price -- confirm against the page.
            price = card.xpath(".//a[contains(@class, 'item-nameLenght-K5Z item-name-3TH')]/span/text()").extract()

            item = {'price': price
                    }
            yield item


  [1]: https://i.stack.imgur.com/iokmo.png

All and any help is greatly appreciated. I can’t seem to figure out what I am doing wrong.

Asked By: Sod

||

Answers:

Use the site's GraphQL API directly with requests:

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

url =  'https://eshop.nomin.mn/graphql?query=query+category%28%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+brand+salable_qty+brand_name+c21_available+c21_business_type+c21_reference+c21_street+c21_area+c21_bed_room+mp_daily_deal%7Bcreated_at+date_from+date_to+deal_id+deal_price+remaining_time+deal_qty+discount_label+is_featured+product_id+product_name+product_sku+sale_qty+status+store_ids+updated_at+__typename%7Dnew_to_date+short_description%7Bhtml+__typename%7DproductAttributes%7Bname+value+__typename%7Dprice%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dspecial_price+special_to_date+thumbnail%7Bfile_small+url+__typename%7Durl_key+url_suffix+mp_label_data%7Benabled+name+priority+label_template+label_image+to_date+__typename%7D...on+ConfigurableProduct%7Bvariants%7Bproduct%7Bsku+special_price+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7D__typename%7D__typename%7D__typename%7D__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=category&variables=%7B%22currentPage%22%3A1%2C%22id%22%3A24175%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22in%22%3A%2224175%22%7D%7D%2C%22pageSize%22%3A50%2C%22sort%22%3A%7B%22position%22%3A%22DESC%22%7D%7D'



 r = requests.get(url)
json_data = json.loads(r.text)
data_docs = json_data['data']['products']['items']

df = pd.DataFrame.from_dict(data_docs)
print(df)
Answered By: Elkhan

Use the website's data API instead of the URL that you visit in your browser. It returns a JSON object that has all the information you are looking for.

import scrapy
import re


class TempSpider(scrapy.Spider):
    """Spider that reads product names and prices from the shop's GraphQL endpoint."""

    name = 'temp_spider'
    # allowed_domains takes bare domain names; a full URL entry is ignored
    # by Scrapy's offsite filtering (with a warning).
    allowed_domains = ['eshop.nomin.mn']
    # Single GraphQL "category" query; the variables at the end of the URL
    # request page 1 with a page size of 50 for category 24175.
    start_urls = ['https://eshop.nomin.mn/graphql?query=query+category($pageSize:Int!$currentPage:Int!$filters:ProductAttributeFilterInput!$sort:ProductAttributeSortInput){products(pageSize:$pageSize+currentPage:$currentPage+filter:$filters+sort:$sort){items{id+name+sku+brand+salable_qty+brand_name+c21_available+c21_business_type+c21_reference+c21_street+c21_area+c21_bed_room+mp_daily_deal{created_at+date_from+date_to+deal_id+deal_price+remaining_time+deal_qty+discount_label+is_featured+product_id+product_name+product_sku+sale_qty+status+store_ids+updated_at+__typename}new_to_date+short_description{html+__typename}productAttributes{name+value+__typename}price{regularPrice{amount{currency+value+__typename}__typename}__typename}special_price+special_to_date+thumbnail{file_small+url+__typename}url_key+url_suffix+mp_label_data{enabled+name+priority+label_template+label_image+to_date+__typename}...on+ConfigurableProduct{variants{product{sku+special_price+price{regularPrice{amount{currency+value+__typename}__typename}__typename}__typename}__typename}__typename}__typename}page_info{total_pages+__typename}total_count+__typename}}&operationName=category&variables={"currentPage":1,"id":24175,"filters":{"category_id":{"in":"24175"}},"pageSize":50,"sort":{"position":"DESC"}}']

    def parse(self, response, **kwargs):
        """Yield a ``{"name", "price"}`` dict for each product in the JSON response.

        The price is read from ``price.regularPrice.amount.value`` of each item.
        """
        data = response.json()
        for item in data['data']["products"]["items"]:
            yield {
                "name": item["name"],
                "price": item["price"]["regularPrice"]["amount"]["value"]
            }

Partial OUTPUT


{'name': 'Хиам Аялал кг', 'price': 19559}
{'name': 'Чихэр Княжеские 1кг', 'price': 24859}
{'name': 'Жимсний чанамал Mr', 'price': 11999}
{'name': 'Vit C ', 'price': 28799}
{'name': 'Жүүс Моя семья', 'price': 3629}
{'name': 'Муурны ялгадас шингээх', 'price': 31999}
{'name': 'Компот Vidan 920гр', 'price': 8879}
{'name': 'Мөс 0.5кг 024218', 'price': 2029}
{'name': 'Өргөст хэмх Hainich', 'price': 7799}
{'name': 'Соус чилитэй 215гр', 'price': 9499}
{'name': 'Цай Ottogi улаан', 'price': 14299}
{'name': 'Цай шингэн Pfanner', 'price': 9379}
{'name': '02381088', 'price': 3179}
{'name': 'Өглөөний хоол G&G', 'price': 8239}
{'name': '02S003167', 'price': 7699}
{'name': '02S003133', 'price': 8299}
{'name': 'Кофе Жокей империал', 'price': 14279}
{'name': 'Жүүс Pfanner orange', 'price': 13129}
{'name': 'Цуу улаан дарсны', 'price': 6939}
{'name': 'Оливын тос Borges', 'price': 14749}
{'name': 'Оливын тос classic', 'price': 33629}
{'name': 'Оливын тос Borges', 'price': 18629}
{'name': 'Гоймон Borges Fusilli', 'price': 5939}
{'name': 'Цай шингэн чавганы', 'price': 2469}
{'name': 'Гоймон Нүүдэл 500гр', 'price': 3759}
{'name': 'Муурны хоол 85гр', 'price': 1889}
{'name': 'Бэлэн Карри зөөлөн', 'price': 7499}
{'name': 'Цай Dr.Baatar 2гр*16ш', 'price': 11999}
{'name': 'Нухаш Urbanek ', 'price': 6979}
{'name': 'Вандуй лууван холимог', 'price': 5899}
{'name': 'Өргөст хэмх Bagro', 'price': 13499}
{'name': 'Бэлэн хоол Samyang', 'price': 6189}
{'name': 'Жүүс Naturalis apple', 'price': 1589}
{'name': 'Жүүс Naturalis Apple-grape', 'price': 5999}
{'name': 'Жүүс Naturalis Apple-sour', 'price': 5999}
{'name': 'Жүүс Vita Pomegranate', 'price': 3659}
{'name': 'Шоколад Luna 33гр', 'price': 1499}
{'name': 'Жүүс Фруктовый Сад', 'price': 5999}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}

You can find the URL for the API in the Network tab of your browser's DevTools.

[enter image description here]

Answered By: Alexander