GraphQL API with Scrapy

Question:

I’m trying to get data from https://www.ouedkniss.com/boutiques/immobilier . I found that ouedkniss.com is using GraphQL API. I tried to use this API but failed to pull data and also to paginate. An error is showing. AttributeError: 'list' object has no attribute 'get' I don’t know if I miss something else here or not. Here is what I tried so far:

import scrapy
import json
from ..items import OuedknissItem
from scrapy.loader import ItemLoader

class StoresSpider(scrapy.Spider):
    name = 'stores'
    allowed_domains = ['www.ouedkniss.com']
  

    def start_requests(self):
        payload = json.dumps([
        {
        "operationName": "SearchStore",
        "query": "query Campaign($slug: String!) {n  project(slug: $slug) {n    idn    isSharingProjectBudgetn    risksn    story(assetWidth: 680)n    currencyn    spreadsheet {n      displayModen      publicn      urln      data {n        namen        valuen        phasen        rowNumn        __typenamen      }n      dataLastUpdatedAtn      __typenamen    }n    environmentalCommitments {n      idn      commitmentCategoryn      descriptionn      __typenamen    }n    __typenamen  }n}n",
        "variables": {
            "q": "", "filter": {
            "categorySlug": "immobilier", 
            "count": 12, "page": 1},
            "categorySlug": "immobilier",
            "count": 12,
            "page": 1
        },
        
        }
        ])
        headers= {
            "Content-Type": "application/json",
            # "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
            }
        yield scrapy.Request(
            url='https://api.ouedkniss.com/graphql',
            method="POST",
            headers=headers,
            body=payload,
            callback=self.parse
            )
        return super().start_requests()

    def parse(self, response):
        json_resp = json.loads(response.body)
        # print(json_resp)
        stores = json_resp.get('data')[0].get('stores').get('data')
        for store in stores:
            loader = ItemLoader(item=OuedknissItem())
            loader.add_value('name', store.get('name'))
            yield loader.load_item()
Asked By: Raisul Islam

||

Answers:

i think you can’t use ouedkniss Api, because the policy request is allowed just for the origin request as you can see below.
enter image description here

Answered By: yacine malki

Your payload json data wasn’t well-formatted thats why output was validator errors.Now it’s working fine.

import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader

class StoresSpider(scrapy.Spider):
    name = 'stores'
    allowed_domains = ['www.ouedkniss.com']
  

    def start_requests(self):
        payload = json.dumps({
   "operationName":"SearchStore",
   "variables":{
      "q":"",
      "filter":{
         "categorySlug":"immobilier",
         "count":12,
         "page":1
      }
   },
   "query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n  stores: storeSearch(q: $q, filter: $filter) {n    data {n      idn      namen      slugn      descriptionn      imageUrln      followerCountn      announcementsCountn      urln      mainLocation {n        location {n          region {n            namen            __typenamen          }n          city {n            namen            __typenamen          }n          __typenamen        }n        __typenamen      }n      announcements(count: 6, page: 1) {n        data {n          idn          defaultMedia(size: SMALL) {n            mediaUrln            __typenamen          }n          __typenamen        }n        __typenamen      }n      __typenamen    }n    paginatorInfo {n      lastPagen      __typenamen    }n    __typenamen  }n}n"
})
        headers= {
            "Content-Type": "application/json",
            # "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
            }
        yield scrapy.Request(
            url='https://api.ouedkniss.com/graphql',
            method="POST",
            headers=headers,
            body=payload,
            callback=self.parse
            )
        return super().start_requests()

    def parse(self, response):
        json_resp = json.loads(response.body)
        #print(json_resp)
        
        stores = json_resp.get('data').get('stores').get('data')[0]
        print(stores)
        # loader = ItemLoader(item=OuedknissItem())
        # yield loader.load_item()

Output:

{'id': '7088', 'name': 'Rachid Dounia', 'slug': 'rachid-dounia', 'description': 'agence immobiliere', 'imageUrl': 'https://cdn.ouedkniss.com/stores/7088/Logo.jpg', 'followerCount': 4, 'announcementsCount': 11, 'url': '', 'mainLocation': {'location': {'region': {'name': 'Algiers', '__typename': 'Region'}, 'city': {'name': 'Cheraga', '__typename': 'City'}, '__typename': 'Location'}, '__typename': 'StoreLocation'}, 'announcements': {'data': [{'id': '34036104', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33491623', 'defaultMedia': {'mediaUrl': 'https://cdn9.ouedkniss.com/200/medias/announcements/images/pA6vV/4llx7bXtpjVv8196UOgs3ebpXai5HAYl7rs51MAD.jpg', '__typename': 'AnnouncementMedia'}, '__typename': 'Announcement'}, {'id': '33491551', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '27271413', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33794330', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '32853052', 'defaultMedia': None, '__typename': 'Announcement'}], '__typename': 'AnnouncementPagination'}, '__typename': 'Store'}
2022-12-13 00:09:28 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-13 00:09:28 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1319,
 'downloader/request_count': 1,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 3260,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,

Update along with payload pagination:

import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader

class StoresSpider(scrapy.Spider):
    name = 'stores'
    allowed_domains = ['www.ouedkniss.com']
  

    def start_requests(self):
        payload = {
            "operationName":"SearchStore",
            "variables":{
                "q":"",
                "filter":{
                "categorySlug":"immobilier",
                "count":12,
                "page": 1
                }},
            "query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n  stores: storeSearch(q: $q, filter: $filter) {n    data {n      idn      namen      slugn      descriptionn      imageUrln      followerCountn      announcementsCountn      urln      mainLocation {n        location {n          region {n            namen            __typenamen          }n          city {n            namen            __typenamen          }n          __typenamen        }n        __typenamen      }n      announcements(count: 6, page: 1) {n        data {n          idn          defaultMedia(size: SMALL) {n            mediaUrln            __typenamen          }n          __typenamen        }n        __typenamen      }n      __typenamen    }n    paginatorInfo {n      lastPagen      __typenamen    }n    __typenamen  }n}n"
            }
        headers= {
            "Content-Type": "application/json",
            # "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
            }
       
        for payload['variables']['filter']['page'] in range(1,3):
            yield scrapy.Request(
                url='https://api.ouedkniss.com/graphql',
                method="POST",
                headers=headers,
                body=json.dumps(payload),
                callback=self.parse
                )
       
    

    def parse(self, response):
        json_resp = json.loads(response.body)
        #print(json_resp)
        
        stores = json_resp['data']['stores']['data']
        for store in stores:
            yield {
                'id':store['id']
            }
       
Answered By: Fazlul

It looks like you’re not passing the required variables into the query.

You have:

query Campaign($slug: String!) {… result fields}

This query expects a single variable slug.

Meanwhile your variables are:

"variables": {
  "q": "", 
  "filter": {
    "categorySlug": "immobilier", 
    "count": 12, "page": 1},
    "categorySlug": "immobilier",
    "count": 12,
    "page": 1
   },
}

(You have count and categorySlug there twice by the way)

Try:

query Campaign($q: String, $filter: StoreSearchFilterInput!) {… result fields}

You should probably check response.ok to ensure your query succeeded before attempting to parse it.

Answered By: Michel Floyd