How would you scrape a website into a heading and content CSV file using BeautifulSoup?

Question:

I am trying to scrape the sci-kit documentation (https://scikit-learn.org/stable/modules/linear_model.html) into a 2 column CSV: the first would be the headings in the documentation and the second column would be the content under that heading. Even if there are multiple paragraphs under the heading, it should be in the same row as the heading it is under.

This is my current code:

from bs4 import BeautifulSoup
import requests

page_link = 'https://scikit-learn.org/stable/modules/linear_model.html'
page_response = requests.get(page_link,verify=False, timeout=5)
page_content = BeautifulSoup(page_response.content, "html.parser")
textContent = []
for tag in page_content.find_all('h2')[1:]:
    texth2=tag.text.strip()
    textContent.append(texth2)
    for item in tag.find_next_siblings('p'):
        if texth2 in item.find_previous_siblings('h2')[0].text.strip():
            textContent.append(item.text.strip())
Asked By: user20140267

||

Answers:

Try:

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup


url = "https://scikit-learn.org/stable/modules/linear_model.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")


def get_section(tag):
    name = tag.find(["h1", "h2", "h3", "h4"]).text.strip("ΒΆ")
    text = []
    for s in tag.find_all(recursive=False):
        if s.name == "section":
            yield from get_section(s)
        elif s.name not in {"h1", "h2", "h3", "h4"}:
            text.append(
                s.get_text(strip=True, separator=" ").replace("n", " ")
            )
    yield name, " ".join(text).strip()


root = soup.select_one("div[role=main] section")

df = pd.DataFrame(
    tuple(
        sorted(
            get_section(root),
            key=lambda v: tuple(map(int, re.findall(r"d+", v[0]))),
        )
    ),
    columns=["Heading", "Value"],
)

print(df.head(10).to_markdown(index=False))

df.to_csv("data.csv", index=False)

Prints the dataframe and saves data.csv (screenshot from LibreOffice):

enter image description here

Answered By: Andrej Kesely
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.