How to break the crawl loop when it reaches the last page (requests, Python)?

Question:

I made a crawler with requests and I would like to stop it when it reaches the last page. Where should I put the break statement so that the loop ends on the last page? Right now it runs but never stops at the last page. I have attached the program below. I would appreciate any help.

import requests
from lxml import html
from time import sleep
import csv

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}
page_counter = 1
links = []
while True:
    try:
        url = "https://www.amazon.com/s/ref=sr_pg_{0}?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&page={0}&bbn=11444086011&ie=UTF8&qid=1517650207".format(
            page_counter)
        response = requests.get(url, headers=headers, proxies=proxies, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            page_counter += 1
        else:
            break
    except Exception:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Current page ", page_counter)
        print("Was a nice sleep, now let me continue...")

csvfile = "products.csv"

# Assuming links is a flat list of URLs
with open(csvfile, "w", newline="") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in links:
        writer.writerow([val])

Asked By: ryy77


Answers:

Try this snippet as an example, and then please extend it with your custom functions:

from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}

links = []
url = 'https://www.amazon.com/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&bbn=11444086011&ie=UTF8&qid=1517831374'

while True:
    try:
        print('Fetching url [%s]...' % url)
        response = requests.get(url, headers=headers, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            try:
                # On the last page there is no "Next" link, so this XPath
                # returns an empty list and the IndexError below ends the loop.
                next_url = source.xpath('//*[@id="pagnNextLink"]/@href')[0]
                url = urljoin('https://www.amazon.com', next_url)
            except IndexError:
                break
    except Exception:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Was a nice sleep, now let me continue...")

print(links)

It scrapes the current page for the link to the next page. If the URL of the next page can be found, the crawler follows it; if it cannot be found, the while loop breaks and the collected list of links is printed.
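If you prefer to keep your original page-counter loop instead of following the "Next" link, another option is to break as soon as a page yields no result links. This is only a minimal sketch: it assumes the same XPath as above, that pages past the last one simply return no results, and that base_url stands for your search URL with a {0} page placeholder and headers for the dict defined earlier.

import requests
from lxml import html

# Assumptions: base_url is your search URL with a "{0}" page placeholder
# (as in the question) and headers is the same headers dict defined above.
page_counter = 1
links = []
while True:
    response = requests.get(base_url.format(page_counter), headers=headers, stream=True)
    if response.status_code != 200:
        break
    source = html.fromstring(response.content)
    page_links = source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href')
    if not page_links:
        # No result links on this page, so the previous page was the last one.
        break
    links.extend(page_links)
    page_counter += 1

Note that this version stops on any non-200 response, whereas the snippet above keeps retrying the same URL.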

Hope it helps.
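If you want to save the collected links to a CSV file instead of printing them, you can reuse the writer block from your question, just with a proper newline terminator:

import csv

csvfile = "products.csv"
with open(csvfile, "w", newline="") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in links:
        writer.writerow([val])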

Answered By: Szabolcs