Python Scrapy framework: adding a proxy to my code

Question:

I am trying a new feature for myself: adding a proxy port to my Python scraper code.

I took a free proxy from this site and looked for an answer on SO. With the help of user @dskrypa I changed my code to use meta={'proxy':'103.42.162.50:8080'}

Now it gives an error which repeats continuously unless I stop the run.

File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 279, in _get_agent
    proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\webclient.py", line 39, in _parse
    return _parsed_url_args(parsed)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
    host = to_bytes(parsed.hostname, encoding="ascii")
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 108, in to_bytes
    raise TypeError('to_bytes must receive a str or bytes '
TypeError: to_bytes must receive a str or bytes object, got NoneType
2023-03-12 02:47:32 [scrapy.core.scraper] ERROR: Error downloading <GET https://dvlaregistrations.dvla.gov.uk/search/results.html?search=N11CKY&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto=>

Here is my code;

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import scrapy_xlsx

# Accumulates every yielded item in memory, in addition to the xlsx feed export.
itemList=[]
class plateScraper(scrapy.Spider):
    """Look up each number plate from data.xlsx on the DVLA registrations
    site and yield ``{"plate": ..., "price": ...}`` items (price "-" when
    the plate is not offered for sale), exported to output_r00.xlsx.
    """

    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']
    FEED_EXPORTERS = {'xlsx': 'scrapy_xlsx.XlsxItemExporter'}
    # DOWNLOADER_MIDDLEWARES is only honoured by Scrapy when supplied through
    # settings; a bare class attribute is ignored, so it goes into
    # custom_settings below (required for request.meta['proxy'] to take effect).
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 1
    }
    custom_settings = {
        'FEED_EXPORTERS': FEED_EXPORTERS,
        'FEED_FORMAT': 'xlsx',
        'FEED_URI': 'output_r00.xlsx',
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_DELAY': 0,
        'DOWNLOADER_MIDDLEWARES': DOWNLOADER_MIDDLEWARES,
    }

    def start_requests(self):
        """Read plate numbers from the 'PLATE' column of data.xlsx and issue
        one search request per plate, routed through the proxy."""
        df = pd.read_excel('data.xlsx')
        for plate_num_xlsx in df['PLATE']:
            url = (
                f"https://dvlaregistrations.dvla.gov.uk/search/results.html"
                f"?search={plate_num_xlsx}&action=index&pricefrom=0&priceto="
                f"&prefixmatches=&currentmatches=&limitprefix=&limitcurrent="
                f"&limitauction=&searched=true&openoption=&language=en"
                f"&prefix2=Search&super=&super_pricefrom=&super_priceto="
            )
            yield scrapy.Request(
                url,
                callback=self.parse,
                cb_kwargs={'plate_num_xlsx': plate_num_xlsx},
                # BUG FIX: the proxy URL must include the scheme. Without
                # "http://", urlparse() yields hostname=None and Scrapy raises
                # "TypeError: to_bytes must receive a str or bytes object,
                # got NoneType" (as seen in the traceback above).
                meta={'proxy': 'http://103.42.162.50:8080'},
            )

    def parse(self, response, plate_num_xlsx=None):
        """Extract the plate/price from the results page; fall back to
        price "-" when the plate is absent or does not match."""
        plate = response.xpath('//div[@class="resultsstrip"]/a/text()').extract_first()
        price = response.xpath('//div[@class="resultsstrip"]/p/text()').extract_first()

        try:
            # plate/price may be None when no result strip exists; the
            # .replace()/.strip() calls then raise AttributeError, which is
            # handled below as "not for sale".
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate_num_xlsx, "price": price.strip()}
            else:
                item = {"plate": plate_num_xlsx, "price": "-"}
        except AttributeError:
            item = {"plate": plate_num_xlsx, "price": "-"}
        itemList.append(item)
        print(item)
        yield item

# Run the spider in-process; start() blocks until the crawl finishes.
process = CrawlerProcess()
process.crawl(plateScraper)
process.start()

# Windows-only audible beep (555 Hz, 333 ms) to signal the crawl is done.
import winsound
winsound.Beep(555,333)
Asked By: xlmaster

||

Answers:

You should include the protocol (scheme) in the proxy URL:

meta={"proxy": "http://103.42.162.50:8080"}
Answered By: zaki98
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.