Scrapy – Extract text with new line presented in the rendered html page

Question:

I am scraping this website, https://www.handbook.fca.org.uk/handbook/PRIN/2A/?date=2030-12-01&timeline=True&view=chapter, and I’d like to extract this part: "References in PRIN to the obligations on firms under Principle 12 include the obligations imposed by rules in PRIN 2A." with the newlines as presented in the HTML page. Right now I am getting all the text in one line. Here is the HTML source code:

<p><a href="/instrument/2022/FCA_2022_31.pdf" target="_blank" title="FCA 2022/31 - 31/07/2023" class="changed-by">1</a>References in <a href="/handbook/glossary/G908.html?date=2030-12-01" class="autodeftext">PRIN</a> to the obligations on <a href="/handbook/glossary/G430.html?date=2030-12-01" class="autodeftext">firms</a> under <a href="/handbook/glossary/G910.html?date=2030-12-01" class="autodeftext">Principle</a> 12 include the obligations imposed by <a href="/handbook/glossary/G1036.html?date=2030-12-01" class="autodeftext">rules</a> in <span class="xref"><span class="xrefout"><a href="/handbook/PRIN/2A/?date=2030-12-01#D1">PRIN 2A</a></span></span>.</p>

And here is the scraper code:

import scrapy
from urllib.parse import urlencode


class HandBook(scrapy.Spider):
    """Spider that requests one FCA Handbook chapter page and yields one item per typed section.

    The request URL is built from ``url`` plus the URL-encoded ``params`` and
    is sent with the browser-like ``headers`` below. Each yielded item is a
    plain dict; ``custom_settings`` routes items through the project's
    ``HandbookExcelPipeline``.
    """

    # name of the spider - scrapy crawl handbook_spider
    name = "handbook_spider"

    # custom settings for the spider to enable customize log file and transforming extracted data to xlsx format via custom pipeline
    custom_settings = {
        "LOG_FILE": "handbook_spider.log",
        "ITEM_PIPELINES": {
            "handbook_spider.pipelines.HandbookExcelPipeline": 300,
        },
    }

    # headers to request the website in order to imitate the real browser instead of random crawling bot
    headers = {
        "authority": "www.handbook.fca.org.uk",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en,ru;q=0.9",
        "cache-control": "max-age=0",
        "sec-ch-ua": '"Chromium";v="106", "Yandex";v="22", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "cross-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.3.832 (beta) Yowser/2.5 Safari/537.36",
    }

    # query-string parameters appended to ``url``
    params = {
        "date": "2030-12-01",
        "timeline": "True",
        "view": "chapter",
    }

    url = "https://www.handbook.fca.org.uk/handbook/PRIN/3/?"

    # starting point of scraper
    def start_requests(self):
        """Yield the single initial request for the configured chapter page."""
        base_url = self.url + urlencode(self.params)
        yield scrapy.Request(
            url=base_url, headers=self.headers, callback=self.parse_details
        )

    # parsing the content
    def parse_details(self, response):
        """Yield one dict per section that carries a ``span.section-type`` element.

        Sections holding an ``h2.crosstitle`` only update the running topic
        and produce no item. For every other section the text is taken from
        its table (when a ``div.section-content-table`` is present, in which
        case the ``h3`` header also becomes the topic) or from its
        ``div.section-content``, with each text node stripped and the nodes
        joined by single spaces.

        ``Clause`` and ``Sub_Clause`` are the last two dot-separated parts of
        the section uid; they are ``None`` when the uid is missing or has too
        few parts, so one malformed section does not abort the whole page.
        """
        # A distinct name for the chapter node: the section text list below
        # must not rebind the variable the outer loop iterates with.
        for chapter_node in response.css("div.handbook-content"):
            chapter_ref = chapter_node.xpath(
                "./header/h1/span[@class='extended']/text()"
            ).get()
            chapter = "".join(
                chapter_node.xpath("./header/h1/text()").getall()
            ).strip()
            # The topic carries over from the latest cross-title/table header
            # to the sections that follow it.
            topic = None
            for section in chapter_node.css("section"):
                header = section.css("h2.crosstitle::text")
                if header:
                    topic = header.get()
                    continue

                table_content = section.css("div.section-content-table")
                if table_content:
                    topic = section.xpath(".//header/h3/text()").get()
                    text_nodes = table_content.xpath(".//table//text()").getall()
                else:
                    text_nodes = section.xpath(
                        ".//div[@class='section-content']//text()"
                    ).getall()

                # Only sections with a type marker are emitted.
                if not section.css("span.section-type"):
                    continue

                clause_text = " ".join(map(str.strip, text_nodes))
                uid = section.xpath(".//span[@class='extended']/text()").get()
                # Guard against a missing uid (None) or one without dots
                # instead of raising AttributeError / IndexError.
                uid_parts = uid.split(".") if uid else []
                yield {
                    "Unique_ids": uid,
                    "Chapter_ref": chapter_ref,
                    "Chapter": chapter,
                    "Topic": topic,
                    "Clause": uid_parts[-2] if len(uid_parts) >= 2 else None,
                    "Sub_Clause": uid_parts[-1] if uid_parts else None,
                    "Type": section.css("span.section-type::text").get(),
                    "Date_applicable": section.xpath(
                        ".//time/span/text()"
                    ).get(),
                    "Text": clause_text,
                }

Can anyone please help me to extract the text with newline intact? Thanks!

Asked By: X-somtheing

||

Answers:

Unfortunately, there is no structured way to extract that line break. The new line that you see in your browser is only there because of the size of the surrounding page elements. There is no HTML that formally tells that paragraph where to break the line in this instance.

To demonstrate, you can open up your browser's dev tools and adjust the CSS controls so that the line break occurs at the end of a different portion of the text.

enter image description here

Answered By: Alexander
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.