Convert html table to json with BeautifulSoup
Question:
I am trying to convert an HTML table to JSON using BeautifulSoup in Python. I was able to convert it, but the data comes out in the wrong format.
from bs4 import BeautifulSoup
import json
# Sample markup: every row alternates label cells ("customlabel") with
# value cells ("custominput").
reading_table = """
<table>
<tbody>
<tr>
<td><span class="customlabel">Energy Source</span></td>
<td><span class="custominput">EB</span></td>
<td><span class="customlabel">Grid Reading </span></td>
<td><span class="custominput">2666.2</span></td>
<td><span class="customlabel">DG Reading </span></td>
<td><span class="custominput">15.5</span></td>
</tr>
<tr>
<td><span class="customlabel">Power Factor</span></td>
<td><span class="custominput">0.844</span></td>
<td><span class="customlabel">Total Kw</span></td>
<td><span class="custominput">0.273</span></td>
<td><span class="customlabel">Total KVA</span></td>
<td><span class="custominput">0.34</span></td>
</tr>
<tr>
<td><span class="customlabel">Average Voltage</span></td>
<td><span class="custominput">241.7</span></td>
<td><span class="customlabel">Total Current</span></td>
<td><span class="custominput">1.54</span></td>
<td><span class="customlabel">Frequency Hz</span></td>
<td><span class="custominput">50</span></td>
</tr>
</tbody>
</table>
"""
# Parse the markup once, then collect the text of every cell grouped by the
# table row it belongs to (one inner list per <tr>).
soup = BeautifulSoup(reading_table, features="html.parser")
reading_table_data = [
    [cell.get_text() for cell in row.find_all("td")]
    for row in soup.find_all("tr")
]
print(reading_table_data)
The above code prints a nested list (one inner list per table row) in the format below:
[['Energy Source', 'EB', 'Grid Reading ', '2666.2', 'DG Reading ', '15.5'], ['Power Factor', '0.844', 'Total Kw', '0.273', 'Total KVA', '0.34'], ['Average Voltage', '241.7', 'Total Current', '1.54', 'Frequency Hz', '50']]
I would like to get it in the format below:
[
'Energy Source': 'EB',
'Grid Reading ': '2666.2',
'DG Reading ': '15.5',
'Power Factor': '0.844',
'Total Kw': '0.273',
'Total KVA': '0.34',
'Average Voltage': '241.7',
'Total Current': '1.54',
'Frequency Hz': '50'
]
Some help is appreciated
Answers:
The output you want is not valid JSON (square brackets denote a list, which cannot hold key/value pairs), so you can build a dict, convert it to a string, and replace the braces with square brackets before printing it.
Here is the working code:
# Build a label -> value mapping by pairing every "customlabel" cell with the
# "custominput" cell that follows it.
# find_all is the current name of the legacy findAll alias.
tds = BeautifulSoup(reading_table, features="html.parser").find_all("td")
data = {}
attr_key = None  # most recent label seen; None until the first label cell
for td in tds:
    span = td.span
    if span is None:
        # A cell without a <span> carries neither a label nor a value.
        continue
    # "class" is absent on an unclassed span; treat that as no classes.
    classes = span.get("class") or []
    if "customlabel" in classes:
        attr_key = span.text
        data[attr_key] = ""
    if "custominput" in classes and attr_key is not None:
        # Skip a value that appears before any label instead of failing.
        data[attr_key] = span.text
# Swap only the outermost braces for square brackets so that any "{" or "}"
# inside a label or value is left untouched.
print("[" + json.dumps(data)[1:-1] + "]")
Right now you are reading the cell value of each td. What you need is to differentiate between key and value depending on the CSS class of the span inside each td. After that, you will need to use a dictionary comprehension rather than a list comprehension.
You can try this:
from bs4 import BeautifulSoup
import json
# Sample markup: every row alternates label cells ("customlabel") with
# value cells ("custominput").
reading_table = """
<table>
<tbody>
<tr>
<td><span class="customlabel">Energy Source</span></td>
<td><span class="custominput">EB</span></td>
<td><span class="customlabel">Grid Reading </span></td>
<td><span class="custominput">2666.2</span></td>
<td><span class="customlabel">DG Reading </span></td>
<td><span class="custominput">15.5</span></td>
</tr>
<tr>
<td><span class="customlabel">Power Factor</span></td>
<td><span class="custominput">0.844</span></td>
<td><span class="customlabel">Total Kw</span></td>
<td><span class="custominput">0.273</span></td>
<td><span class="customlabel">Total KVA</span></td>
<td><span class="custominput">0.34</span></td>
</tr>
<tr>
<td><span class="customlabel">Average Voltage</span></td>
<td><span class="custominput">241.7</span></td>
<td><span class="customlabel">Total Current</span></td>
<td><span class="custominput">1.54</span></td>
<td><span class="customlabel">Frequency Hz</span></td>
<td><span class="custominput">50</span></td>
</tr>
</tbody>
</table>
"""
# Flatten the table into one list of cell texts, in document order.
soup = BeautifulSoup(reading_table, features="html.parser")
reading_table_data = [cell.get_text() for cell in soup.find_all("td")]

# Cells alternate label, value, label, value, ... so the even positions are
# the keys and the odd positions are the matching values.
labels = reading_table_data[::2]
values = reading_table_data[1::2]
data = json.dumps(
    {label: value for label, value in zip(labels, values)},
    indent=4,
)
print(data)
Output:
{
"Energy Source": "EB",
"Grid Reading ": "2666.2",
"DG Reading ": "15.5",
"Power Factor": "0.844",
"Total Kw": "0.273",
"Total KVA": "0.34",
"Average Voltage": "241.7",
"Total Current": "1.54",
"Frequency Hz": "50"
}
I am trying to convert an HTML table to JSON using BeautifulSoup in Python. I was able to convert it, but the data comes out in the wrong format.
from bs4 import BeautifulSoup
import json
# Sample markup: every row alternates label cells ("customlabel") with
# value cells ("custominput").
reading_table = """
<table>
<tbody>
<tr>
<td><span class="customlabel">Energy Source</span></td>
<td><span class="custominput">EB</span></td>
<td><span class="customlabel">Grid Reading </span></td>
<td><span class="custominput">2666.2</span></td>
<td><span class="customlabel">DG Reading </span></td>
<td><span class="custominput">15.5</span></td>
</tr>
<tr>
<td><span class="customlabel">Power Factor</span></td>
<td><span class="custominput">0.844</span></td>
<td><span class="customlabel">Total Kw</span></td>
<td><span class="custominput">0.273</span></td>
<td><span class="customlabel">Total KVA</span></td>
<td><span class="custominput">0.34</span></td>
</tr>
<tr>
<td><span class="customlabel">Average Voltage</span></td>
<td><span class="custominput">241.7</span></td>
<td><span class="customlabel">Total Current</span></td>
<td><span class="custominput">1.54</span></td>
<td><span class="customlabel">Frequency Hz</span></td>
<td><span class="custominput">50</span></td>
</tr>
</tbody>
</table>
"""
# Parse the markup once, then collect the text of every cell grouped by the
# table row it belongs to (one inner list per <tr>).
soup = BeautifulSoup(reading_table, features="html.parser")
reading_table_data = [
    [cell.get_text() for cell in row.find_all("td")]
    for row in soup.find_all("tr")
]
print(reading_table_data)
The above code prints a nested list (one inner list per table row) in the format below:
[['Energy Source', 'EB', 'Grid Reading ', '2666.2', 'DG Reading ', '15.5'], ['Power Factor', '0.844', 'Total Kw', '0.273', 'Total KVA', '0.34'], ['Average Voltage', '241.7', 'Total Current', '1.54', 'Frequency Hz', '50']]
I would like to get it in the format below:
[
'Energy Source': 'EB',
'Grid Reading ': '2666.2',
'DG Reading ': '15.5',
'Power Factor': '0.844',
'Total Kw': '0.273',
'Total KVA': '0.34',
'Average Voltage': '241.7',
'Total Current': '1.54',
'Frequency Hz': '50'
]
Some help is appreciated
The output you want is not valid JSON (square brackets denote a list, which cannot hold key/value pairs), so you can build a dict, convert it to a string, and replace the braces with square brackets before printing it.
Here is the working code:
# Build a label -> value mapping by pairing every "customlabel" cell with the
# "custominput" cell that follows it.
# find_all is the current name of the legacy findAll alias.
tds = BeautifulSoup(reading_table, features="html.parser").find_all("td")
data = {}
attr_key = None  # most recent label seen; None until the first label cell
for td in tds:
    span = td.span
    if span is None:
        # A cell without a <span> carries neither a label nor a value.
        continue
    # "class" is absent on an unclassed span; treat that as no classes.
    classes = span.get("class") or []
    if "customlabel" in classes:
        attr_key = span.text
        data[attr_key] = ""
    if "custominput" in classes and attr_key is not None:
        # Skip a value that appears before any label instead of failing.
        data[attr_key] = span.text
# Swap only the outermost braces for square brackets so that any "{" or "}"
# inside a label or value is left untouched.
print("[" + json.dumps(data)[1:-1] + "]")
Right now you are reading the cell value of each td. What you need is to differentiate between key and value depending on the CSS class of the span inside each td. After that, you will need to use a dictionary comprehension rather than a list comprehension.
You can try this:
from bs4 import BeautifulSoup
import json
# Sample markup: every row alternates label cells ("customlabel") with
# value cells ("custominput").
reading_table = """
<table>
<tbody>
<tr>
<td><span class="customlabel">Energy Source</span></td>
<td><span class="custominput">EB</span></td>
<td><span class="customlabel">Grid Reading </span></td>
<td><span class="custominput">2666.2</span></td>
<td><span class="customlabel">DG Reading </span></td>
<td><span class="custominput">15.5</span></td>
</tr>
<tr>
<td><span class="customlabel">Power Factor</span></td>
<td><span class="custominput">0.844</span></td>
<td><span class="customlabel">Total Kw</span></td>
<td><span class="custominput">0.273</span></td>
<td><span class="customlabel">Total KVA</span></td>
<td><span class="custominput">0.34</span></td>
</tr>
<tr>
<td><span class="customlabel">Average Voltage</span></td>
<td><span class="custominput">241.7</span></td>
<td><span class="customlabel">Total Current</span></td>
<td><span class="custominput">1.54</span></td>
<td><span class="customlabel">Frequency Hz</span></td>
<td><span class="custominput">50</span></td>
</tr>
</tbody>
</table>
"""
# Flatten the table into one list of cell texts, in document order.
soup = BeautifulSoup(reading_table, features="html.parser")
reading_table_data = [cell.get_text() for cell in soup.find_all("td")]

# Cells alternate label, value, label, value, ... so the even positions are
# the keys and the odd positions are the matching values.
labels = reading_table_data[::2]
values = reading_table_data[1::2]
data = json.dumps(
    {label: value for label, value in zip(labels, values)},
    indent=4,
)
print(data)
Output:
{
"Energy Source": "EB",
"Grid Reading ": "2666.2",
"DG Reading ": "15.5",
"Power Factor": "0.844",
"Total Kw": "0.273",
"Total KVA": "0.34",
"Average Voltage": "241.7",
"Total Current": "1.54",
"Frequency Hz": "50"
}