Is there a really simple method for printing scraped output to a csv file?

Question

Python: Python 3.11.2
Python Editor: PyCharm 2022.3.3 (Community Edition) – Build PC-223.8836.43
OS: Windows 11 Pro, 22H2, 22621.1413
Browser: Chrome 111.0.5563.65 (Official Build) (64-bit)

I have a URL (e.g., https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963) from which I’m scraping nine items. I’m looking to have the script create a csv file and write my scraped output (nine items) to columns in the csv file. Is there a really simple way of doing this?

from bs4 import BeautifulSoup
import requests
import csv

# Download the docket page and parse it with the lxml parser.
html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_ = "wrapper jcard has-padding-30 blocks has-no-bottom-padding")

for case in cases:
    # The non-breaking space is written as the escape "\xa0" so the character
    # being replaced is visible in the source (a literal one is
    # indistinguishable from a normal space).
    # NOTE: find() returns None when nothing matches, so a missing element
    # raises AttributeError on the corresponding line.
    case_title = case.find("div", class_ = "title-wrapper").text.replace("\xa0", " ")
    case_plaintiff = case.find("td", {"data-th": "Plaintiff"}).text.replace("\xa0", " ")
    case_defendant = case.find("td", {"data-th": "Defendant"}).text.replace("\xa0", " ")
    case_number = case.find("td", {"data-th": "Case Number"}).text.replace("\xa0", " ")
    case_filed = case.find("td", {"data-th": "Filed"}).text.replace("\xa0", " ")
    court = case.find("td", {"data-th": "Court"}).text.replace("\xa0", " ")
    case_nature_of_suit = case.find("td", {"data-th": "Nature of Suit"}).text.replace("\xa0", " ")
    case_cause_of_action = case.find("td", {"data-th": "Cause of Action"}).text.replace("\xa0", " ")
    jury_demanded = case.find("td", {"data-th": "Jury Demanded By"}).text.replace("\xa0", " ")

    # Print the nine values, one per line, without surrounding whitespace.
    print(case_title.strip())
    print(case_plaintiff.strip())
    print(case_defendant.strip())
    print(case_number.strip())
    print(case_filed.strip())
    print(court.strip())
    print(case_nature_of_suit.strip())
    print(case_cause_of_action.strip())
    print(jury_demanded.strip())

Asked By: Tollundium

||

Source

Answer 1

Generate a list of lists with your data and dump that out to a csv:

from bs4 import BeautifulSoup
import requests
import csv

# Download the docket page and parse it with the lxml parser.
html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_ = "wrapper jcard has-padding-30 blocks has-no-bottom-padding")

# One inner list per case; each inner list becomes one CSV row.
output = []
for case in cases:
    # The non-breaking space is written as the escape "\xa0" so the character
    # being replaced is visible in the source.
    case_title = case.find("div", class_ = "title-wrapper").text.replace("\xa0", " ")
    case_plaintiff = case.find("td", {"data-th": "Plaintiff"}).text.replace("\xa0", " ")
    case_defendant = case.find("td", {"data-th": "Defendant"}).text.replace("\xa0", " ")
    case_number = case.find("td", {"data-th": "Case Number"}).text.replace("\xa0", " ")
    case_filed = case.find("td", {"data-th": "Filed"}).text.replace("\xa0", " ")
    court = case.find("td", {"data-th": "Court"}).text.replace("\xa0", " ")
    case_nature_of_suit = case.find("td", {"data-th": "Nature of Suit"}).text.replace("\xa0", " ")
    case_cause_of_action = case.find("td", {"data-th": "Cause of Action"}).text.replace("\xa0", " ")
    jury_demanded = case.find("td", {"data-th": "Jury Demanded By"}).text.replace("\xa0", " ")

    output.append([
        case_title.strip(),
        case_plaintiff.strip(),
        case_defendant.strip(),
        case_number.strip(),
        case_filed.strip(),
        court.strip(),
        case_nature_of_suit.strip(),
        case_cause_of_action.strip(),
        jury_demanded.strip(),
    ])


# In Python 3 csv.writer writes str, so the file must be opened in text mode;
# binary mode ("wb") fails with TypeError on the first row. newline="" lets
# the csv module control line endings itself, which avoids the blank line
# after every row that otherwise appears on Windows.
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(output)

Answered By: JNevill

Answer 2

Sure – the easiest is the standard library csv module.

I took the liberty of refactoring your .replace().strip() stuff with a single function; we’re also gathering up all of the case data into a list-of-dicts first before writing it out to a file. This makes it easier to add new columns without having to deal with their names twice.

from bs4 import BeautifulSoup
import requests
import csv


def process_text(s):
    """Return the cleaned-up text of *s* for use as a CSV cell.

    *s* may be a string, an object exposing ``get_text()`` (such as the
    element returned by BeautifulSoup's ``find()``), or ``None`` (what
    ``find()`` returns when nothing matches). ``None`` yields an empty
    string so that a missing element becomes an empty cell.

    Non-breaking spaces are replaced with ordinary spaces and surrounding
    whitespace is stripped.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        # Parsed elements are not strings; take their text content first.
        s = s.get_text()
    # "\xa0" is the non-breaking space, spelled as an escape so that it is
    # visible in the source.
    return s.replace("\xa0", " ").strip()


# Download the docket page and parse it with the lxml parser.
html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_="wrapper jcard has-padding-30 blocks has-no-bottom-padding")

# One dict per case; the keys double as the CSV column names.
data = []

for case in cases:
    # find() returns an element, not a string, so its .text is what gets
    # handed to process_text().
    data.append(
        {
            "case_title": process_text(case.find("div", class_="title-wrapper").text),
            "case_plaintiff": process_text(case.find("td", {"data-th": "Plaintiff"}).text),
            "case_defendant": process_text(case.find("td", {"data-th": "Defendant"}).text),
            "case_number": process_text(case.find("td", {"data-th": "Case Number"}).text),
            "case_filed": process_text(case.find("td", {"data-th": "Filed"}).text),
            "court": process_text(case.find("td", {"data-th": "Court"}).text),
            "case_nature_of_suit": process_text(case.find("td", {"data-th": "Nature of Suit"}).text),
            "case_cause_of_action": process_text(case.find("td", {"data-th": "Cause of Action"}).text),
            "jury_demanded": process_text(case.find("td", {"data-th": "Jury Demanded By"}).text),
        }
    )

# The header is taken from the first row, so there must be at least one.
if not data:
    raise SystemExit("No cases found on the page; nothing to write to cases.csv.")

# newline="" lets the csv module control line endings itself, which avoids
# the blank line after every row that otherwise appears on Windows.
with open("cases.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)

Answered By: AKX

Answer 3

pandas DataFrames have a .to_csv method.

# import pandas as pd
# Destination of the CSV output (a file name or a full path).
csv_filename = 'x.csv'

# data-th labels of the table cells that are collected as columns.
th_fields = [
    'Plaintiff', 'Defendant', 'Case Number', 'Filed', 'Court',
    'Nature of Suit', 'Cause of Action', 'Jury Demanded By',
]

case_rows = []
for card in cases:
    heading = card.find("div", class_ = "title-wrapper")
    record = {}
    if heading:
        record['title'] = heading.text.strip()
    # A function given as the attribute value makes find_all keep only the
    # <td> cells whose data-th value is listed in th_fields.
    wanted_cells = card.find_all('td', {'data-th': (lambda th: th in th_fields)})
    record.update((cell['data-th'], cell.text.strip()) for cell in wanted_cells)
    case_rows.append(record)

# Build a DataFrame from the list of dicts and write it without the index.
frame = pd.DataFrame(case_rows)
frame.to_csv(csv_filename, index=False)
# Transposed alternative, written without a header line:
# pd.DataFrame(case_rows).T.to_csv(csv_filename, index=False, header=False)

You can also transpose it with .T.

You can also get all the tables from the URL directly with pandas' read_html:

# import pandas as pd
url = 'https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963'
csv_filename = 'x.csv'

# read_html returns one DataFrame per table it finds at the URL.
for table_index, table in enumerate(pd.read_html(url)):
    # The first table starts the file afresh; every later one is appended.
    write_mode = 'w' if table_index == 0 else 'a'
    table.to_csv(csv_filename, mode=write_mode, index=False)
    # Append an empty one-column frame after each table
    # (presumably as a visual separator between tables).
    pd.DataFrame({'': []}).to_csv(csv_filename, mode='a')

Answered By: Driftr95

Is there a really simple method for printing scraped output to a csv file?

Question:

Answers: