Python BS4 Scraper only returning first 9 results from each page

Question:

I got this code set up and it seemed to be working as intended, but it's not quite there. Everything looked great until I checked my CSV output file and noticed that I'm only getting the first 9 results from each page. There should be 40 results per page, so I'm getting less than 25% of what I was expecting.

Any thoughts?

import requests
from bs4 import BeautifulSoup
import json
import time
import csv
from random import randint

class ZillowScraper():
    results = []
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cookie': 'zguid=23|%24c7bcad1c-8b4d-4a05-851f-792593d217c6; zgsession=1|3c6fcf4a-8614-4f34-8bda-ae3bdf4dfea4; _ga=GA1.2.704623546.1625782868; _gid=GA1.2.1782030485.1625782868; zjs_user_id=null; zjs_anonymous_id=%22c7bcad1c-8b4d-4a05-851f-792593d217c6%22; _gcl_au=1.1.351252561.1625782869; KruxPixel=true; DoubleClickSession=true; _pxvid=cb25d36e-e03a-11eb-84e3-0242ac12000a; _fbp=fb.1.1625782869145.921769265; __pdst=b36b2d1d9b8d4887bd0a555f86fa6715; _pin_unauth=dWlkPVlUWXpNalUxWldVdE9HRmtaUzAwTURjd0xXRmpNVE10Tm1FNVkySm1ZV00zTXpNeA; utag_main=v_id:017a8835deb5009c2affd760e97003073001706b00bd0$_sn:1$_se:1$_ss:1$_st:1625784669690$ses_id:1625782869690%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_visit:1$dc_event:1%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:f3f96136-c4ca-4c87-965e-2b2fc4de4fc3%3Bexp-session; KruxAddition=true; JSESSIONID=F4E2E1E3BA713A9785B729EE23D25B53; _px3=8008679292a31e7d8ef9456a85fe1422b567b72bc9831635f4252e37d74e8f7c:ECQ0UzHRB2JavfWlnUzMXnulfreSDrNddDdFBQVV6DOzCBBDdMiPv19ppZy77slBQhxI5mPRZGEdxA5gzRECnA==:1000:wXTO3Ig/nYxLzR8M0+lxMszX38JV6Uev2W04opdTyfHCE4Dy1SdVfxV55tOAONwNc72ppbH8Hlu/jkd5DO6QQKrZO9yfA3uEGuVjkHrB0YYNZ7NcSd/xNAICGbds9MZxcbm9BoeEC2obtht8ktQPLuNx74Al0F97NIL97W8jrzIzJI+M9O0FCawc2jaYZF03ZLWPg8uzK4o9FjGhRzxl2g==; _uetsid=cbbb1f50e03a11ebbbfe333812066027; _uetvid=cbbbd830e03a11eba87e1953ad00fb35; __gads=ID=c0a8eafd08785626:T=1625782884:S=ALNI_MYzalOP2DP0BK8JMHzWH5kj9trbKA; _gat=1; AWSALB=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; AWSALBCORS=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; search=6|1628375133494%7Cregion%3Dorange-county-ca%26rect%3D34.68%252C-116.83%252C32.68%252C-118.83%26disp%3Dmap%26mdm%3Dauto%26pt%3D%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%091286%09%09%09%09%09%09',
        'referer': 'https://www.zillow.com/orange-county-ca/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Orange%20County%2C%20CA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-118.27155909375001%2C%22east%22%3A-117.26081690625001%2C%22south%22%3A33.20798771954729%2C%22north%22%3A34.12462559847427%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A1286%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%7D',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def fetch(self, url, params):
        response = requests.get(url, headers=self.headers, params=params)
        print(response.status_code)
        return response

    def parse(self, response):
        content = BeautifulSoup(response, 'lxml')
        deck = content.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})
        for card in deck.contents:
            script = card.find('script', {'type': 'application/ld+json'})
            if script:
                script_json = json.loads(script.contents[0])

                self.results.append({
                    'latitude': script_json['geo']['latitude'],
                    'longitude': script_json['geo']['longitude'],
                    'name': script_json['name'],
                    'streetaddress': script_json['address']['streetAddress'],
                    'city': script_json['address']['addressLocality'],
                    'state': script_json['address']['addressRegion'],
                    'zip': script_json['address']['postalCode'],
                    'floorSize': script_json['floorSize']['value'],
                    'url': script_json['url'],
                    'price': card.find('div', {'class': 'list-card-price'}).text
                })

    def to_csv(self):
        with open('zillow.csv', 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()

            for row in self.results:
                writer.writerow(row)

    def run(self):
        url = 'https://www.zillow.com/homes/for_sale/Orange-County,-CA_rb/'

        for page in range(1,5):
            params = {
                'searchQueryState': '{"pagination":{"currentPage": %s},"usersSearchTerm":"Orange County, CA","mapBounds":{"west":-118.27155909375001,"east":-117.26081690625001,"south":33.20798771954729,"north":34.12462559847427},"regionSelection":[{"regionId":1286,"regionType":4}],"isMapVisible":true,"filterState":{"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"}},"isListVisible":true}' %page
            }
            res = self.fetch(url, params)
            self.parse(res.text)
            time.sleep(randint(5,15))
            self.to_csv()

if __name__ == '__main__':
    scraper = ZillowScraper()
    scraper.run()

Asked By: FormidableData


Answers:

Be aware that you hold full responsibility for scraping Zillow; this is a technical answer for demonstration purposes only, as I've been warned by the site devs before :). The reason you only get the first 9 results is that Zillow lazy-loads its listing cards: only the first few cards in the initial HTML carry the ld+json script your parser relies on, while the rest are placeholders that JavaScript fills in as you scroll, so requests never sees them.
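
You can verify the lazy loading with a quick diagnostic before switching approaches. This is a minimal sketch that reuses the question's URL, selector, and user-agent (the class name is copied from the question and may have changed since):

import requests
from bs4 import BeautifulSoup, Tag

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Fetch one results page and count how many listing cards actually
# carry the ld+json payload; lazy-loaded placeholders will not.
response = requests.get('https://www.zillow.com/homes/for_sale/Orange-County,-CA_rb/', headers=HEADERS)
soup = BeautifulSoup(response.text, 'lxml')
deck = soup.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})
if deck is None:
    raise SystemExit('Listing container not found (blocked, or the markup changed)')

total = with_data = 0
for card in deck.contents:
    if not isinstance(card, Tag):  # deck.contents can include bare whitespace strings
        continue
    total += 1
    if card.find('script', {'type': 'application/ld+json'}):
        with_data += 1

print(f'{with_data} of {total} cards contain ld+json data')  # e.g. "9 of 40"

If that prints something like "9 of 40", your parser is fine and the page simply doesn't contain the other 31 results. The JSON search endpoint below sidesteps this entirely: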

import requests
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}


def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        req.head('https://www.zillow.com/')
        for item in range(1, 2):
            # item is unused as written; use it to paginate by substituting it into "currentPage" below (see the pagination sketch after the output)
            params = {
                "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Orange County, CA","mapBounds":{"west":-118.84559473828126,"east":-116.68678126171876,"south":33.34208982842918,"north":33.99173886991076},"regionSelection":[{"regionId":1286,"regionType":4}],"isMapVisible":true,"filterState":{"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"}},"isListVisible":true,"mapZoom":9}',
                "wants": '{"cat1":["mapResults"]}'
            }
            r = req.get(url, params=params)
            df = pd.DataFrame(r.json()['cat1']['searchResults']['mapResults'])
            print(df)
            df.to_csv('data.csv', index=False)


main('https://www.zillow.com/search/GetSearchPageState.htm')

Output:

         zpid       price  ... streetViewMetadataURL  streetViewURL
0    25608235    $990,900  ...                   NaN            NaN
1    25586987  $1,070,100  ...                   NaN            NaN
2    25154858    $681,100  ...                   NaN            NaN
3    25486269    $834,200  ...                   NaN            NaN
4    25762795    $696,900  ...                   NaN            NaN
..        ...         ...  ...                   ...            ...
495  25538170    $975,000  ...                   NaN            NaN
496  25622055    $575,000  ...                   NaN            NaN
497  25657278    $649,900  ...                   NaN            NaN
498  63114426  $1,578,000  ...                   NaN            NaN
499  25643107     $89,900  ...                   NaN            NaN

[500 rows x 40 columns]
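
If you need more than one page, the item loop variable can be substituted into "currentPage". Here is a minimal sketch that builds searchQueryState with json.dumps instead of a hand-written string; the fetch_pages helper, the page count, and the sleep interval are my own additions, not part of the original answer:

import json
import time
import requests
import pandas as pd

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5"
}


def fetch_pages(url, pages=3):
    frames = []
    with requests.Session() as req:
        req.headers.update(HEADERS)
        req.head('https://www.zillow.com/')  # pick up session cookies first, as above
        for page in range(1, pages + 1):
            query_state = {
                "pagination": {"currentPage": page},
                "usersSearchTerm": "Orange County, CA",
                "mapBounds": {"west": -118.84559473828126, "east": -116.68678126171876,
                              "south": 33.34208982842918, "north": 33.99173886991076},
                "regionSelection": [{"regionId": 1286, "regionType": 4}],
                "isMapVisible": True,
                "filterState": {"isAllHomes": {"value": True},
                                "sortSelection": {"value": "globalrelevanceex"}},
                "isListVisible": True,
                "mapZoom": 9
            }
            params = {
                "searchQueryState": json.dumps(query_state),
                "wants": '{"cat1":["mapResults"]}'
            }
            r = req.get(url, params=params)
            frames.append(pd.DataFrame(r.json()['cat1']['searchResults']['mapResults']))
            time.sleep(5)  # be polite between page requests
    return pd.concat(frames, ignore_index=True)


df = fetch_pages('https://www.zillow.com/search/GetSearchPageState.htm')
df.to_csv('data.csv', index=False)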

You can also extract the information another way: by turning the mobile-search JSON embedded in the page into a dictionary:

from bs4 import BeautifulSoup
import requests
import json

URL = "https://www.zillow.com/homes/for_sale/?searchQueryState=%7B%22mapBounds%22%3A%7B%22west%22%3A-120.807715421875%2C%22east%22%3A-119.56626034375%2C%22south%22%3A50.16275172026892%2C%22north%22%3A51.173245251779626%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept": "*/*",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

# Scrape housing data from Zillow for the location encoded in the URL
response = requests.get(url=URL, headers=HEADERS)
soup = BeautifulSoup(response.text, 'html.parser')

# The JSON payload is wrapped in an HTML comment (<!-- ... -->), so strip the markers
information = soup.find('script', {"data-zrr-shared-data-key": "mobileSearchPageStore"}).text.strip("<!--").strip("-->")
information_dict = json.loads(information)
house_result_dict = information_dict['cat1']['searchResults']['listResults']

for listing in house_result_dict:
    print(f"URL:     {listing['detailUrl']}")
    print(f"Address: {listing['address']}")
    print(f"Space:   {listing['area']}")
    print(f"Price:   {listing['price']}")

print(len(house_result_dict))
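
Since the original goal was a CSV file, here is a minimal sketch that continues from the snippet above and writes the same four fields out with csv.DictWriter (the key names are the ones printed above; area is read with .get() on the assumption that some listings may omit it):

import csv

with open('zillow_listings.csv', 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['detailUrl', 'address', 'area', 'price'])
    writer.writeheader()
    for listing in house_result_dict:
        writer.writerow({
            'detailUrl': listing['detailUrl'],
            'address': listing['address'],
            'area': listing.get('area'),  # assumed optional; left blank if missing
            'price': listing['price'],
        })
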
Answered By: 0xskar