I am trying to navigate through the pages of a website and scrape its links, but the same page's data is scraped even after I change the page number.

Question:

from bs4 import BeautifulSoup
import requests
import pymongo


def traverse_source():
    """Collect article links from every page of the journal issue.

    Bug fix: the original code sent the page number in a request *header*
    named "path" (which the server ignores) while `source_url` was an empty
    string — so pagination never happened. The `pageStart` parameter must be
    part of the URL query string itself.

    Returns:
        list[str]: absolute URLs of all articles found across the 9 pages.
    """
    article_links = []

    # Headers are loop-invariant, so build them once.
    headers = {
        # Browser-like User-Agent; some sites reject the default requests UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        "Sec-fetch-site": "same-origin",
    }

    for pgindx in range(9):
        # Pagination belongs in the URL, not in the headers.
        source_url = (
            "https://www.annemergmed.com/issue/"
            f"S0196-0644(21)X0012-1?pageStart={pgindx}"
        )

        source_data = requests.get(source_url, headers=headers)
        # Fail fast on HTTP errors instead of silently parsing an error page.
        source_data.raise_for_status()

        source_soup = BeautifulSoup(source_data.content, "html.parser")

        for dest in source_soup.find_all("h3", attrs={'class': 'toc__item__title'}):
            # Some <h3> entries carry no anchor; skip those explicitly
            # rather than swallowing every exception with a bare except.
            anchor = dest.a
            if anchor is not None and anchor.get('href'):
                article_links.append("https://www.annemergmed.com" + anchor['href'])

    print(article_links)
    return article_links

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    traverse_source()

Here, even after incrementing the page number in the URL, the content of the first webpage is always scraped. I tried navigating through the pages using the GET method (changing the URL), but it still scrapes the data of page number 1.

Answers:

This is one way of scraping that data:

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Browser-like User-Agent so the site serves the normal HTML pages.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
# A Session reuses the TCP connection and sends the headers on every request.
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(9)):
    # The page number goes in the URL query string (pageStart), not a header —
    # this is why the question's code kept re-fetching page 1.
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    r.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(r.text, 'html.parser')
    for t in soup.select('div.articleCitation'):
        link = t.select_one('h3 a')
        url = link.get('href')
        header = link.text
        try:
            authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
        except AttributeError:  # select_one returned None: no authors listed
            authors = 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))

# Deduplicate while preserving first-seen order; list(set(...)) gives a
# nondeterministic row order between runs.
df = pd.DataFrame(list(dict.fromkeys(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))

This will return:

(409, 3)

Title   Url Authors
0   194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext  M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1   112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext  S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2   87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries  https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext  M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
 [...]
Answered By: platipus_on_fire