Create nested lists from another list, with dynamically incremented sublist names, based on a byte-size check
Question:
I have a list filled with the content of paragraph tags (<p></p>) scraped from a site using beautifulsoup4.
I would like to break that list into nested lists whose names are dynamically incremented, with each increment triggered by a byte-size check on the current nested list. The result will then be used to create a JSON object.
My current code, for example:
import requests
from bs4 import BeautifulSoup

def getContent():
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()
    data = {}
    SECTION_INDEX = 1
    data_container = []
    total_article_size = 0
    article_section_data = []
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        data[article_section] = article_section_data
        if article_section_size >= 300:
            SECTION_INDEX = SECTION_INDEX + 1
    return(data)

def createJson():
    data = getContent()
    json_source = {
        "ARTICLE_DATA": data
    }
    json_object = json.dumps(json_source, indent=2)

def main():
    createJson()
The actual result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_2": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_3": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ]
  }
}
The desired result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there"
    ],
    "CONTENT_SECTION_2": [
      "some more content",
      "even more content from the site"
    ],
    "CONTENT_SECTION_3": [
      "and some even more",
      "and finally, some more"
    ]
  }
}
How can I achieve this, and why does the actual result above show the repeated pattern?
Answers:
To achieve the desired result, you can track the size of the current article section with the sys.getsizeof function and split the data_container list into smaller lists once the desired byte size is reached. Here’s the updated code:
import requests
from bs4 import BeautifulSoup
import sys

def getContent():
    # Note: requests needs a full URL including the scheme in practice,
    # e.g. "https://www.example.com".
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()

    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []

    # Collect the text of every <p> tag.
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)

    # Split the paragraphs into sections, closing a section once its
    # accumulated size (as measured by sys.getsizeof) reaches 300.
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            # The current section is full: store it, then start a fresh
            # list and reset the running size for the next section.
            data[article_section] = article_section_data
            article_section_data = []
            article_section_size = 0
            SECTION_INDEX = SECTION_INDEX + 1

    # Store any remaining paragraphs as the last (partial) section.
    if article_section_data:
        data[article_section] = article_section_data

    return(data)
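One caveat worth mentioning: sys.getsizeof(p) returns the in-memory size of the Python string object, including interpreter overhead, not the number of bytes the text itself takes up. If the 300-byte limit is meant to apply to the encoded text, a possible alternative for the size check (a sketch, assuming UTF-8 encoding) is:

article_section_size += len(p.encode("utf-8"))  # size of the encoded text only, no object overhead

Either measure splits the list; they just place the section boundaries at slightly different points. The returned dictionary can then be passed to your createJson function unchanged (with import json added) to produce the final JSON object.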
The repeated pattern in your actual result comes from always appending p to the same article_section_data list and never resetting it once the desired byte size was reached: every CONTENT_SECTION_N key ended up referencing that one ever-growing list, so all sections show identical content.
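To see the same effect in isolation, here is a minimal sketch (purely illustrative, unrelated to the scraping code) of what happens when several dictionary keys are assigned the same list object:

data = {}
shared = []
for i, word in enumerate(["a", "b", "c"], start=1):
    shared.append(word)
    data["SECTION_" + str(i)] = shared  # every key points at the same list object

print(data)
# {'SECTION_1': ['a', 'b', 'c'], 'SECTION_2': ['a', 'b', 'c'], 'SECTION_3': ['a', 'b', 'c']}

Assigning a brand-new list (article_section_data = []) after each full section, as in the updated code, is what breaks that sharing.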