Create a nested list from another list, starting a new dynamically named sublist based on a byte-size check

Question:

I have a list filled with the content of paragraph tags (<p></p>) scraped from a site using beautifulsoup4.

I would like to break that list into a nested structure of sublists whose names are incremented dynamically, with a new sublist started whenever a byte-size check on the current sublist passes a threshold. The result should then be used to create a JSON object.

My current code for example:

import requests
from bs4 import BeautifulSoup


def getContent():
    """Fetch a page and collect its <p> texts into a dict of named sections.

    Returns the ``data`` dict, whose keys have the form
    ``CONTENT_SECTION_<n>`` and whose values are lists of paragraph text.
    """

    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()  # NOTE(review): never read again in this function
    
    data = {}
    SECTION_INDEX = 1
    data_container = []
    total_article_size = 0  # NOTE(review): initialised here but not used below
    article_section_data = []



    # Gather the text of every <p> tag, in document order.
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)

    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        # article_section_data is never rebound, so every key stored in
        # `data` refers to this one list, which keeps growing.
        article_section_data.append(p)
        data[article_section] = article_section_data


        # NOTE(review): article_section_size is not assigned anywhere in this
        # function, so this comparison cannot be evaluated as written.
        if article_section_size >= 300:
            SECTION_INDEX = SECTION_INDEX + 1

    return(data)

def createJson():
    """Wrap the sections returned by getContent() in a JSON document.

    Returns:
        The JSON text (indented by two spaces) of an object whose single
        ``"ARTICLE_DATA"`` key holds the value returned by ``getContent()``.
    """
    # Imported locally because the surrounding snippet never imports json.
    import json

    json_source = {"ARTICLE_DATA": getContent()}
    # Hand the serialised text back to the caller instead of discarding it.
    return json.dumps(json_source, indent=2)


def main():
    """Run createJson(); its return value is not used here."""
    createJson()

The actual result:

{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there",
      "some more content".
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_2": [
      "the actual paragraphs",
      "content goes there",
      "some more content".
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_3": [
      "the actual paragraphs",
      "content goes there",
      "some more content".
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ]
  }
}

The desired result:

{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there"
    ],
    "CONTENT_SECTION_2": [
      "some more content",
      "even more content from the site"
    ],
    "CONTENT_SECTION_3": [
      "and some even more",
      "and finally, some more"
    ]
  }
}

How can I achieve this, and why does the actual result above repeat the same content in every section?

Asked By: NIGHTSCROLLER

||

Answers:

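To achieve the desired result, you can track the size of the current article section with the sys.getsizeof function and start a new sublist each time the desired size is reached. Note that sys.getsizeof reports the in-memory size of the Python string object, which includes interpreter overhead; if you need the size of the text itself, use len(p.encode("utf-8")) instead. Here’s the updated code: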

import requests
from bs4 import BeautifulSoup
import sys

def getContent(url="https://www.example.com", max_section_size=300):
    """Fetch a page and group its paragraph texts into size-limited sections.

    The text of every <p> tag is appended to the current section. Once the
    accumulated size of that section reaches ``max_section_size`` the section
    is stored under ``CONTENT_SECTION_<n>`` and a new, empty one is started.
    Paragraphs left over after the last full section are stored as a final,
    smaller section.

    Args:
        url: Address of the page to fetch. It must include the scheme
            (``http://`` or ``https://``); requests rejects bare host names.
        max_section_size: Threshold, as a sum of ``sys.getsizeof`` values,
            at which the current section is closed.

    Returns:
        A dict mapping ``"CONTENT_SECTION_1"``, ``"CONTENT_SECTION_2"``, ...
        to lists of paragraph strings, in document order. Empty when the
        page contains no <p> tags.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    page = requests.get(url, timeout=10)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    data = {}
    section_index = 1
    section_paragraphs = []
    section_size = 0

    for tag in soup.find_all("p"):
        text = tag.text
        section_paragraphs.append(text)
        # sys.getsizeof reports the size of the str object itself in memory,
        # not the length of the encoded text.
        section_size += sys.getsizeof(text)

        if section_size >= max_section_size:
            data["CONTENT_SECTION_" + str(section_index)] = section_paragraphs
            # Rebind to a new list so the stored section is not mutated by
            # later appends.
            section_paragraphs = []
            section_size = 0
            section_index += 1

    # Keep the trailing paragraphs that never reached the threshold.
    if section_paragraphs:
        data["CONTENT_SECTION_" + str(section_index)] = section_paragraphs

    return data

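The repeated pattern in the actual result occurs because article_section_data is never reset: every paragraph is appended to the same list, and every CONTENT_SECTION_n key in data refers to that one list object, so each section ends up showing all of the paragraphs. Assigning a new empty list (and resetting the size counter) once the desired byte size has been reached gives each section its own list.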

Answered By: GodFather