GraphQL API with Scrapy
Question:
I’m trying to get data from https://www.ouedkniss.com/boutiques/immobilier . I found that ouedkniss.com is using GraphQL API. I tried to use this API but failed to pull data and also to paginate. An error is showing. AttributeError: 'list' object has no attribute 'get'
I don’t know if I miss something else here or not. Here is what I tried so far:
import scrapy
import json
from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps([
{
"operationName": "SearchStore",
"query": "query Campaign($slug: String!) {n project(slug: $slug) {n idn isSharingProjectBudgetn risksn story(assetWidth: 680)n currencyn spreadsheet {n displayModen publicn urln data {n namen valuen phasen rowNumn __typenamen }n dataLastUpdatedAtn __typenamen }n environmentalCommitments {n idn commitmentCategoryn descriptionn __typenamen }n __typenamen }n}n",
"variables": {
"q": "", "filter": {
"categorySlug": "immobilier",
"count": 12, "page": 1},
"categorySlug": "immobilier",
"count": 12,
"page": 1
},
}
])
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
# print(json_resp)
stores = json_resp.get('data')[0].get('stores').get('data')
for store in stores:
loader = ItemLoader(item=OuedknissItem())
loader.add_value('name', store.get('name'))
yield loader.load_item()
Answers:
i think you can’t use ouedkniss Api, because the policy request is allowed just for the origin request as you can see below.
enter image description here
Your payload json data wasn’t well-formatted thats why output was validator errors.Now it’s working fine.
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps({
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page":1
}
},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n stores: storeSearch(q: $q, filter: $filter) {n data {n idn namen slugn descriptionn imageUrln followerCountn announcementsCountn urln mainLocation {n location {n region {n namen __typenamen }n city {n namen __typenamen }n __typenamen }n __typenamen }n announcements(count: 6, page: 1) {n data {n idn defaultMedia(size: SMALL) {n mediaUrln __typenamen }n __typenamen }n __typenamen }n __typenamen }n paginatorInfo {n lastPagen __typenamen }n __typenamen }n}n"
})
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp.get('data').get('stores').get('data')[0]
print(stores)
# loader = ItemLoader(item=OuedknissItem())
# yield loader.load_item()
Output:
{'id': '7088', 'name': 'Rachid Dounia', 'slug': 'rachid-dounia', 'description': 'agence immobiliere', 'imageUrl': 'https://cdn.ouedkniss.com/stores/7088/Logo.jpg', 'followerCount': 4, 'announcementsCount': 11, 'url': '', 'mainLocation': {'location': {'region': {'name': 'Algiers', '__typename': 'Region'}, 'city': {'name': 'Cheraga', '__typename': 'City'}, '__typename': 'Location'}, '__typename': 'StoreLocation'}, 'announcements': {'data': [{'id': '34036104', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33491623', 'defaultMedia': {'mediaUrl': 'https://cdn9.ouedkniss.com/200/medias/announcements/images/pA6vV/4llx7bXtpjVv8196UOgs3ebpXai5HAYl7rs51MAD.jpg', '__typename': 'AnnouncementMedia'}, '__typename': 'Announcement'}, {'id': '33491551', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '27271413', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33794330', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '32853052', 'defaultMedia': None, '__typename': 'Announcement'}], '__typename': 'AnnouncementPagination'}, '__typename': 'Store'}
2022-12-13 00:09:28 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-13 00:09:28 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1319,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 3260,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
Update along with payload pagination:
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = {
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page": 1
}},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n stores: storeSearch(q: $q, filter: $filter) {n data {n idn namen slugn descriptionn imageUrln followerCountn announcementsCountn urln mainLocation {n location {n region {n namen __typenamen }n city {n namen __typenamen }n __typenamen }n __typenamen }n announcements(count: 6, page: 1) {n data {n idn defaultMedia(size: SMALL) {n mediaUrln __typenamen }n __typenamen }n __typenamen }n __typenamen }n paginatorInfo {n lastPagen __typenamen }n __typenamen }n}n"
}
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
for payload['variables']['filter']['page'] in range(1,3):
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=json.dumps(payload),
callback=self.parse
)
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp['data']['stores']['data']
for store in stores:
yield {
'id':store['id']
}
It looks like you’re not passing the required variables into the query.
You have:
query Campaign($slug: String!) {… result fields}
This query expects a single variable slug
.
Meanwhile your variables are:
"variables": {
"q": "",
"filter": {
"categorySlug": "immobilier",
"count": 12, "page": 1},
"categorySlug": "immobilier",
"count": 12,
"page": 1
},
}
(You have count
and categorySlug
there twice by the way)
Try:
query Campaign($q: String, $filter: StoreSearchFilterInput!) {… result fields}
You should probably check response.ok
to ensure your query succeeded before attempting to parse it.
I’m trying to get data from https://www.ouedkniss.com/boutiques/immobilier . I found that ouedkniss.com is using GraphQL API. I tried to use this API but failed to pull data and also to paginate. An error is showing. AttributeError: 'list' object has no attribute 'get'
I don’t know if I miss something else here or not. Here is what I tried so far:
import scrapy
import json
from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps([
{
"operationName": "SearchStore",
"query": "query Campaign($slug: String!) {n project(slug: $slug) {n idn isSharingProjectBudgetn risksn story(assetWidth: 680)n currencyn spreadsheet {n displayModen publicn urln data {n namen valuen phasen rowNumn __typenamen }n dataLastUpdatedAtn __typenamen }n environmentalCommitments {n idn commitmentCategoryn descriptionn __typenamen }n __typenamen }n}n",
"variables": {
"q": "", "filter": {
"categorySlug": "immobilier",
"count": 12, "page": 1},
"categorySlug": "immobilier",
"count": 12,
"page": 1
},
}
])
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
# print(json_resp)
stores = json_resp.get('data')[0].get('stores').get('data')
for store in stores:
loader = ItemLoader(item=OuedknissItem())
loader.add_value('name', store.get('name'))
yield loader.load_item()
i think you can’t use ouedkniss Api, because the policy request is allowed just for the origin request as you can see below.
enter image description here
Your payload json data wasn’t well-formatted thats why output was validator errors.Now it’s working fine.
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps({
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page":1
}
},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n stores: storeSearch(q: $q, filter: $filter) {n data {n idn namen slugn descriptionn imageUrln followerCountn announcementsCountn urln mainLocation {n location {n region {n namen __typenamen }n city {n namen __typenamen }n __typenamen }n __typenamen }n announcements(count: 6, page: 1) {n data {n idn defaultMedia(size: SMALL) {n mediaUrln __typenamen }n __typenamen }n __typenamen }n __typenamen }n paginatorInfo {n lastPagen __typenamen }n __typenamen }n}n"
})
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp.get('data').get('stores').get('data')[0]
print(stores)
# loader = ItemLoader(item=OuedknissItem())
# yield loader.load_item()
Output:
{'id': '7088', 'name': 'Rachid Dounia', 'slug': 'rachid-dounia', 'description': 'agence immobiliere', 'imageUrl': 'https://cdn.ouedkniss.com/stores/7088/Logo.jpg', 'followerCount': 4, 'announcementsCount': 11, 'url': '', 'mainLocation': {'location': {'region': {'name': 'Algiers', '__typename': 'Region'}, 'city': {'name': 'Cheraga', '__typename': 'City'}, '__typename': 'Location'}, '__typename': 'StoreLocation'}, 'announcements': {'data': [{'id': '34036104', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33491623', 'defaultMedia': {'mediaUrl': 'https://cdn9.ouedkniss.com/200/medias/announcements/images/pA6vV/4llx7bXtpjVv8196UOgs3ebpXai5HAYl7rs51MAD.jpg', '__typename': 'AnnouncementMedia'}, '__typename': 'Announcement'}, {'id': '33491551', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '27271413', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33794330', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '32853052', 'defaultMedia': None, '__typename': 'Announcement'}], '__typename': 'AnnouncementPagination'}, '__typename': 'Store'}
2022-12-13 00:09:28 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-13 00:09:28 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1319,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 3260,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
Update along with payload pagination:
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = {
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page": 1
}},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {n stores: storeSearch(q: $q, filter: $filter) {n data {n idn namen slugn descriptionn imageUrln followerCountn announcementsCountn urln mainLocation {n location {n region {n namen __typenamen }n city {n namen __typenamen }n __typenamen }n __typenamen }n announcements(count: 6, page: 1) {n data {n idn defaultMedia(size: SMALL) {n mediaUrln __typenamen }n __typenamen }n __typenamen }n __typenamen }n paginatorInfo {n lastPagen __typenamen }n __typenamen }n}n"
}
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
for payload['variables']['filter']['page'] in range(1,3):
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=json.dumps(payload),
callback=self.parse
)
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp['data']['stores']['data']
for store in stores:
yield {
'id':store['id']
}
It looks like you’re not passing the required variables into the query.
You have:
query Campaign($slug: String!) {… result fields}
This query expects a single variable slug
.
Meanwhile your variables are:
"variables": {
"q": "",
"filter": {
"categorySlug": "immobilier",
"count": 12, "page": 1},
"categorySlug": "immobilier",
"count": 12,
"page": 1
},
}
(You have count
and categorySlug
there twice by the way)
Try:
query Campaign($q: String, $filter: StoreSearchFilterInput!) {… result fields}
You should probably check response.ok
to ensure your query succeeded before attempting to parse it.