How does one call URLs from a csv file, scrape them, and distribute the output into a second csv file?

Question:

Python 3.11.2; PyCharm 2022.3.3 (Community Edition) – Build PC-223.8836.43; OS: Windows 11 Pro, 22H2, 22621.1413; Chrome 111.0.5563.65 (Official Build) (64-bit)

The editing box behaved wonkily, so I have omitted a couple of the intermediate attempts that were unsuccessful.


Is there a way to (1) call URLs in a one-column 10-item list contained in a csv (i.e., "caselist.csv"); and (2) execute a scraping script for each of those URLs (see below) and output all the data to a second csv file ("caselist_output.csv") in which the output is distributed across columns (i.e., case_title, case_plaintiff, case_defendant, case_number, case_filed, court, case_nature_of_suit, case_cause_of_action, jury_demanded) and rows (each of the 10 cases contained in the csv file)?

The ten URLs contained in caselist.csv are:

https://dockets.justia.com/docket/alabama/alndce/6:2013cv01516/148887
https://dockets.justia.com/docket/arizona/azdce/2:2010cv02664/572428
https://dockets.justia.com/docket/arkansas/aredce/4:2003cv01507/20369
https://dockets.justia.com/docket/arkansas/aredce/4:2007cv00051/67198
https://dockets.justia.com/docket/arkansas/aredce/4:2007cv01067/69941
https://dockets.justia.com/docket/arkansas/aredce/4:2008cv00172/70993
https://dockets.justia.com/docket/arkansas/aredce/4:2008cv01288/73322
https://dockets.justia.com/docket/arkansas/aredce/4:2008cv01839/73965
https://dockets.justia.com/docket/arkansas/aredce/4:2008cv02513/74818
https://dockets.justia.com/docket/arkansas/aredce/4:2008cv02666/74976

After failing miserably with my own scripts, I tried both of @Driftr95’s suggestions. The first was:

from bs4 import BeautifulSoup
import requests
import csv

th_fields = { 'case_plaintiff': 'Plaintiff', 'case_defendant': 'Defendant', 'case_number': 'Case Number',
              'case_filed': 'Filed', 'court': 'Court', 'case_nature_of_suit': 'Nature of Suit',
              'case_cause_of_action': 'Cause of Action',  'jury_demanded': 'Jury Demanded By' }
fgtParams = [('div', {'class': 'title-wrapper'})] + [('td', {'data-th': f}) for f in th_fields.values()]

with open('caselist.csv') as f:
    links = [l.strip() for l in f.read().splitlines() if l.strip().startswith('https://dockets.justia.com/docket')]

def find_get_text(bsTag, tName='div', tAttrs=None):
    t = bsTag.find(tName, {} if tAttrs is None else tAttrs)
    if t: return t.get_text(' ',strip=True) # safer as a conditional

def scrape_docketsjustia(djUrl, paramsList=fgtParams):
    soup = BeautifulSoup((r:=requests.get(djUrl)).content, 'lxml')
    cases_class = 'wrapper jcard has-padding-30 blocks has-no-bottom-padding'
    cases = soup.find_all('div', class_=cases_class)

    # print(f'{len(cases)} cases <{r.status_code} {r.reason}> from {r.url}')
    return [[find_get_text(c, n, a) for n, a in paramsList] for c in cases]

all_ouputs = []
for url in links:
    all_ouputs += scrape_docketsjustia(url)

with open("posts/caselist_output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(['case_title', *th_fields]) # [ header row with column names ]
    writer.writerows(all_ouputs)

This script did not produce any output. Not really sure what’s going on…
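
To narrow down where it fails (just a diagnostic sketch, not a fix), one option is to check how many links are actually read and how many case blocks each request yields – for example by un-commenting the print line inside scrape_docketsjustia, or by replacing the collection loop (for url in links: ...) with something like:

print(len(links), 'links loaded from caselist.csv')   # 0 here would mean the file was not read as expected
all_ouputs = []
for url in links:
    rows = scrape_docketsjustia(url)
    print(url, '->', len(rows), 'case block(s)')      # 0 here would mean the jcard class was not found on that page
    all_ouputs += rows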

I also tried @Driftr95’s second suggestion:

import requests
from bs4 import BeautifulSoup
import pandas as pd # [I just prefer pandas]

input_fp = 'caselist.csv'
output_fp = 'caselist_output.csv'
th_fields = { 'case_plaintiff': 'Plaintiff', 'case_defendant': 'Defendant', 'case_number': 'Case Number',
              'case_filed': 'Filed', 'court': 'Court', 'case_nature_of_suit': 'Nature of Suit',
              'case_cause_of_action': 'Cause of Action',  'jury_demanded': 'Jury Demanded By' }
fgtParams = [('case_title', 'div', {'class': 'title-wrapper'})] + [(k, 'td', {'data-th': f}) for k,f in th_fields.items()]
## function definitions ##

def find_get_text(bsTag, tName='div', tAttrs=None):
    t = bsTag.find(tName, {} if tAttrs is None else tAttrs)
    if t: return t.get_text(' ',strip=True)

def scrape_docketsjustia(djUrl, paramsList=fgtParams):
    soup = BeautifulSoup((r:=requests.get(djUrl)).content, 'lxml')
    cases_class = 'wrapper jcard has-padding-30 blocks has-no-bottom-padding'
    for c in soup.find_all('div', class_=cases_class):
        return {k:find_get_text(c,n,a) for k,n,a in paramsList}

    # return {} # just return empty row if cases_class can't be found
    return {'error_msg': f'no cases <{r.status_code} {r.reason}> from {r.url}'}
## main logic ##

## load list of links
# links = list(pd.read_csv(input_fp, header=None)[0]) # [ if you're sure ]
links = [l.strip() for l in pd.read_csv(input_fp)[0] # header will get filtered anyway
         if l.strip().startswith('https://dockets.justia.com/docket/')] # safer

## scrape for each link
df = pd.DataFrame([scrape_docketsjustia(u) for u in links])
# df = pd.DataFrame(map(scrape_docketsjustia,links)).dropna(axis='rows') # drop empty rows
# df['links'] = links # [ add another column with the links ]

## save scraped data
# df.to_csv(output_fp, index=False, header=False) # no column headers
df.to_csv(output_fp, index=False)

This produced the following error messages:

Traceback (most recent call last):
  File "C:\Users\cs\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\cs\PycharmProjects\pythonProject1\solution2.py", line 29, in <module>
    links = [l.strip() for l in pd.read_csv(input_fp)[0] # header will get filtered anyway
                                ~~~~~~~~~~~~~~~~~~~~~^^^
  File "C:\Users\cs\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py", line 3807, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cs\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexes\base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 0
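
As far as I can tell, the KeyError: 0 comes from pd.read_csv treating the first URL in caselist.csv as the header row, so the resulting DataFrame has no column named 0; passing header=None keeps all ten URLs as data, e.g.:

import pandas as pd
urls = pd.read_csv('caselist.csv', header=None)[0]   # with header=None, column 0 exists and the first URL is row 0
print(len(urls))                                     # should print 10 for the list above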


I just ran the script, which I thought worked – but now, all of a sudden, it returns no output (even with the revised line links = [l.strip() for l in pd.read_csv(input_fp, header=None)[0] if l.strip().startswith('https://dockets.justia.com/docket/')]):

import requests
from bs4 import BeautifulSoup
import pandas as pd # [I just prefer pandas]

input_fp = 'caselist.csv'
output_fp = 'caselist_output.csv'
th_fields = { 'case_plaintiff': 'Plaintiff', 'case_defendant': 'Defendant', 'case_number': 'Case Number',
              'case_filed': 'Filed', 'court': 'Court', 'case_nature_of_suit': 'Nature of Suit',
              'case_cause_of_action': 'Cause of Action',  'jury_demanded': 'Jury Demanded By' }
fgtParams = [('case_title', 'div', {'class': 'title-wrapper'})] + [(k, 'td', {'data-th': f}) for k,f in th_fields.items()]
## function definitions ##

def find_get_text(bsTag, tName='div', tAttrs=None):
    t = bsTag.find(tName, {} if tAttrs is None else tAttrs)
    if t: return t.get_text(' ',strip=True)

def scrape_docketsjustia(djUrl, paramsList=fgtParams):
    soup = BeautifulSoup((r:=requests.get(djUrl)).content, 'lxml')
    cases_class = 'wrapper jcard has-padding-30 blocks has-no-bottom-padding'
    for c in soup.find_all('div', class_=cases_class):
        return {k:find_get_text(c,n,a) for k,n,a in paramsList}

    # return {} # just return empty row if cases_class can't be found
    return {'error_msg': f'no cases <{r.status_code} {r.reason}> from {r.url}'}
## main logic ##

## load list of links
# links = list(pd.read_csv(input_fp, header=None)[0]) # [ if you're sure ]
links = [l.strip() for l in pd.read_csv(input_fp , header=None )[0] if l.strip().startswith('https://dockets.justia.com/docket/')] # safer

## scrape for each link
df = pd.DataFrame([scrape_docketsjustia(u) for u in links])
# df = pd.DataFrame(map(scrape_docketsjustia,links)).dropna(axis='rows') # drop empty rows
# df['links'] = links # [ add another column with the links ]

## save scraped data
# df.to_csv(output_fp, index=False, header=False) # no column headers
df.to_csv(output_fp, index=False)
Asked By: PressMeister


Answers:

Solution V1

Is there a way to (1) call URLs in a one-column 10-item list contained in a csv (i.e., "caselist.csv")

With csv.reader you would use something like

# import csv
with open('caselist.csv', newline='') as f:
    links = [l for l, *_ in csv.reader(f)]
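
One small caveat, purely illustrative: a trailing blank line in the file comes back from csv.reader as an empty row, which the l, *_ unpacking can't handle, so a small guard may be safer:

# import csv
with open('caselist.csv', newline='') as f:
    links = [row[0].strip() for row in csv.reader(f) if row and row[0].strip()]  # skip blank rows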

Although, since it’s just a single column with no index or header, you don’t actually need the csv module for this part. You could just read the file directly, e.g. with open('caselist.csv') as f: links = f.read().splitlines(), or, to be safer:

with open('caselist.csv') as f:
    links = [l.strip() for l in f.read().splitlines() if l.strip().startswith('https://dockets.justia.com/docket')]

and (2) execute a scraping script for each of those URLs (see below)

You could wrap your current code [except for the csv.writer block] in a function that takes the URL as an input and returns the output list; but your current code has some repetitive parts, which I think can be reduced to something like

from bs4 import BeautifulSoup
import requests
th_fields = { 'case_plaintiff': 'Plaintiff', 'case_defendant': 'Defendant', 'case_number': 'Case Number', 
              'case_filed': 'Filed', 'court': 'Court', 'case_nature_of_suit': 'Nature of Suit', 
              'case_cause_of_action': 'Cause of Action',  'jury_demanded': 'Jury Demanded By' }
fgtParams = [('div', {'class': 'title-wrapper'})] + [('td', {'data-th': f}) for f in th_fields.values()]
## a simpler version of my selectGet function [ https://pastebin.com/ZnZ7xM6u ] 
def find_get_text(bsTag, tName='div', tAttrs=None):
    t = bsTag.find(tName, {} if tAttrs is None else tAttrs)
    if t: return t.get_text(' ',strip=True) # safer as a conditional

def scrape_docketsjustia(djUrl, paramsList=fgtParams):
    soup = BeautifulSoup((r:=requests.get(djUrl)).content, 'lxml')
    cases_class = 'wrapper jcard has-padding-30 blocks has-no-bottom-padding'
    cases = soup.find_all('div', class_=cases_class)

    # print(f'{len(cases)} cases <{r.status_code} {r.reason}> from {r.url}')
    return [[find_get_text(c, n, a) for n, a in paramsList] for c in cases] 

Once you have the function, you can just loop over all the URLs to collect all the outputs:

all_ouputs = []
for url in links: 
    all_ouputs += scrape_docketsjustia(url)

and output all the data to a second csv file ("caselist_output.csv")

You can save all_ouputs the same way you were saving output, although you can also use the keys of th_fields as column headers if you want:

with open("posts/caselist_output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(['case_title', *th_fields]) # [ header row with column names ]
    writer.writerows(all_ouputs)


Solution V2

and rows (each of the 10 cases contained in the csv file)

I hadn’t noticed this at first, but if you’re expecting only one row for every link, there’s no need to return a list from scrape_docketsjustia – it can just return that one row. So the solution can be adjusted to something like

## setup ##

import requests
from bs4 import BeautifulSoup
import pandas as pd # [I just prefer pandas]

input_fp = 'caselist.csv'
output_fp = 'posts/caselist_output.csv'
th_fields = { 'case_plaintiff': 'Plaintiff', 'case_defendant': 'Defendant', 'case_number': 'Case Number', 
              'case_filed': 'Filed', 'court': 'Court', 'case_nature_of_suit': 'Nature of Suit', 
              'case_cause_of_action': 'Cause of Action',  'jury_demanded': 'Jury Demanded By' }
fgtParams = [('case_title', 'div', {'class': 'title-wrapper'})] + [(k, 'td', {'data-th': f}) for k,f in th_fields.items()]
## function definitions ##

def find_get_text(bsTag, tName='div', tAttrs=None):
    t = bsTag.find(tName, {} if tAttrs is None else tAttrs)
    if t: return t.get_text(' ',strip=True)

def scrape_docketsjustia(djUrl, paramsList=fgtParams):
    soup = BeautifulSoup((r:=requests.get(djUrl)).content, 'lxml')
    cases_class = 'wrapper jcard has-padding-30 blocks has-no-bottom-padding'
    for c in soup.find_all('div', class_=cases_class): 
        return {k:find_get_text(c,n,a) for k,n,a in paramsList}

    # return {} # just return empty row if cases_class can't be found
    return {'error_msg': f'no cases <{r.status_code} {r.reason}> from {r.url}'}
## main logic ##

## load list of links 
# links = list(pd.read_csv(input_fp, header=None)[0]) # [ if you're sure ]
links = [l.strip() for l in pd.read_csv(input_fp, header=None)[0]  
         if l.strip().startswith('https://dockets.justia.com/docket/')] # safer

## scrape for each link
df = pd.DataFrame([scrape_docketsjustia(u) for u in links])
# df = pd.DataFrame(map(scrape_docketsjustia,links)).dropna(axis='rows') # drop empty rows
# df['links'] = links # [ add another column with the links ]

## save scraped data
# df.to_csv(output_fp, index=False, header=False) # no column headers
df.to_csv(output_fp, index=False) 
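
If the output file ends up empty or full of error_msg rows, it can also help to look at what requests is actually getting back for one of the links – some sites respond differently to the default python-requests User-Agent, so sending a browser-like header is a common (though not guaranteed) tweak. Purely an illustrative check, since the cause of the empty output above isn't confirmed:

req_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # any browser-like UA string
r = requests.get(links[0], headers=req_headers)
print(r.status_code, r.reason, len(r.content))  # a 403 or a very small body would point to blocking rather than parsing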


Answered By: Driftr95