How to get an HTML response after sending a POST request in Scrapy

Question:

I’m writing a web scraper in Python with Scrapy, and when I send a POST request I get a JSON response. How can I get the HTML of the page after the request? The thing is, when I choose a category on the website, it sends a POST request without reloading the page, and I need the data that is loaded after that POST request.
My Spider:

import urllib
import scrapy
from scrapy.http import Request
from scrapy.utils.response import open_in_browser


class NonprofitSpider(scrapy.Spider):
    name = 'nonprofit'
    start_urls = ['https://www.guidestar.org/search']


    def parse(self, response):
        
        url = 'https://www.guidestar.org/search/SubmitSearch'

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "$0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "$0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }

        return Request(
            url=url,
            method='POST',
            headers=headers,
            body=urllib.parse.urlencode(data),
            callback=self.start
        )

    def start(self, response):
        print(response.body) # json, but I need to get html
Asked By: Albert
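
Since `SubmitSearch` already responds with JSON, note that the data can usually be consumed straight from that JSON instead of recovering an HTML page. A minimal sketch of that approach, assuming Scrapy ≥ 2.2 (for `response.json()`) and trimming the form data to the fields that matter here; the `Hits` and `Ein` keys are hypothetical placeholders, so inspect the real payload in the browser's network tab:

import scrapy
from scrapy import FormRequest


class NonprofitJsonSpider(scrapy.Spider):
    name = 'nonprofit_json'
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):
        # FormRequest urlencodes the dict and sets the Content-Type
        # header itself, so neither needs doing by hand
        yield FormRequest(
            url='https://www.guidestar.org/search/SubmitSearch',
            formdata={'CurrentPage': '1', 'SearchType': 'org', 'State': 'Alaska'},
            callback=self.parse_results,
        )

    def parse_results(self, response):
        payload = response.json()  # the endpoint answers with JSON
        # 'Hits' and 'Ein' are hypothetical keys used only for
        # illustration -- check the actual response structure
        for hit in payload.get('Hits', []):
            yield {'EIN': hit.get('Ein')}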


Answers:

@Albert Here is an example of a working solution:

CODE:

import time

import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class AlaskaSpider(scrapy.Spider):

    name = 'alaska'

    def start_requests(self):
        url = 'https://www.guidestar.org/search'
        yield SeleniumRequest(
            url=url,
            wait_time=6,
            callback=self.parse)

    def parse(self, response):
        driver = response.meta['driver']
        self.driver = driver  # keep a handle for cleanup in spider_closed

        # type the search term into the search box and submit
        search_input = driver.find_element(By.XPATH, '//*[@class="form-control searchButton"]')
        search_input.send_keys('Alaska')
        search_input.send_keys(Keys.ENTER)

        # give the AJAX search time to render its results
        time.sleep(8)
        driver.save_screenshot('search_result.png')

        # re-wrap the rendered page source so Scrapy selectors work on it
        html = driver.page_source
        resp = Selector(text=html)

        cards = resp.xpath('//*[@id="result-0"]')
        for card in cards:
            yield {
                'EIN': card.xpath('.//*[@class="mb-0"]/text()').get()}

    def spider_closed(self):
        # runs only if hooked up to the spider_closed signal; scrapy-selenium's
        # middleware already quits its driver when the spider closes
        self.driver.close()
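
One refinement worth considering: the fixed `time.sleep(8)` is brittle on slow connections. Selenium's explicit waits poll until an element actually appears, so the sleep could be swapped for something like the sketch below (it assumes the same `result-0` id used above):

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# wait up to 15 seconds for the first result card to render,
# rather than sleeping for a fixed 8 seconds
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="result-0"]'))
)
html = driver.page_source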

Settings.py file: uncomment and change the following entries; the rest of the generated default template can stay as it is. Note that `which` (used for the chromedriver path further down) must be imported at the top of the file:

from shutil import which

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


# Middleware

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

# Selenium

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')  # resolved from PATH
SELENIUM_DRIVER_ARGUMENTS = ['--headless']  # Chrome uses '--headless'; Firefox would use '-headless'
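
Two practical notes on the Selenium block: `which('chromedriver')` only finds the driver if the binary is on `PATH` (and its version must match the installed Chrome); otherwise pass the absolute path as a plain string. The packages themselves come from `pip install scrapy-selenium selenium`.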

Output:

{'EIN': '51-0152394'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0155010'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '27-2390076'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0055697'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '83-4051246'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '31-1207314'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6009991'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6009764'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '20-2590220'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0073478'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6001032'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '23-7302803'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '51-0210787'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6002348'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0155067'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0150993'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0043154'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '30-0854378'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '84-3893461'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '46-1837510'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-1039013'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '14-1958727'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0098901'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '23-7394629'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '81-2318960'}
2021-11-04 18:33:11 [scrapy.core.engine] INFO: Closing spider (finished)
2021-11-04 18:33:11 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:59720/session/05a78d4be9af7205aac54abc0b91118b {}
2021-11-04 18:33:11 [urllib3.connectionpool] DEBUG: http://127.0.0.1:59720 "DELETE /session/05a78d4be9af7205aac54abc0b91118b HTTP/1.1" 200 14
2021-11-04 18:33:11 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-11-04 18:33:14 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/response_bytes': 203850,
 'downloader/response_count': 1,
 'downloader/response_status_count/200':

Answered By: F.Hoque