How to get weather data from site?
Question:
I’m not able to get the Dew_Point and Wind Data from the site https://weather.gc.ca/city/pages/ab-52_metric_e.html
import requests
from lxml import html
# Get the html page
resp=requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html")
# Build html tree
html_tree=html.fromstring(resp.text)
# Attempted dew point lookup (left commented out by the asker).
# NOTE(review): //text() returns every text node under every matching <dd>, so
# index [1] is not necessarily the dew point figure - confirm against the live page.
#Dew_point=html_tree.xpath("//dd[@class='mrgn-bttm-0 wxo-metric-hide'][(parent::dl[@class='dl-horizontal wxo-conds-col2'])]//text()")[1].replace("Â", "")
# Print Dew_point
# NOTE(review): city_name is not defined anywhere in this snippet.
#print(f"Dew_point in {city_name} is {Dew_point}")
# Attempted wind lookup (left commented out by the asker).
# NOTE(review): in the HTML excerpt further down, the Wind <dt>/<dd> pair sits in
# <dl class="dl-horizontal wxo-conds-col3">, while this XPath requires a parent
# <dl> with class 'dl-horizontal wxo-conds-col2' - presumably why it finds nothing.
#Wind=html_tree.xpath("//dd[@class='longContent mrgn-bttm-0 wxo-metric-hide'][(parent::dl[@class='dl-horizontal wxo-conds-col2'])]//text()")[0].replace("Â", "")
# Print Wind
#print(f"Wind in {city_name} is {Wind}")
Data should be in the following format:
Dew point:-2.3°C
Wind: NE 9 km/h
The direction for wind may change.
I’m not sure how to parse the following HTML code, thanks again for the help!
<dt>Temperature:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">13.2°<abbr title="Celsius">C</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">55.8°
<abbr title="Fahrenheit">F</abbr>
</dd>
<dt>Dew point:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">-2.3°<abbr title="Celsius">C</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">27.9°<abbr title="Fahrenheit">F</abbr>
</dd>
<dt>Humidity:</dt>
<dd class="mrgn-bttm-0">34%</dd>
</dl></div>
<div class="col-sm-4"><dl class="dl-horizontal wxo-conds-col3">
<dt>Wind:</dt>
<dd class="longContent mrgn-bttm-0 wxo-metric-hide">
<abbr title="Northeast">NE</abbr> 9 <abbr title="kilometres per hour">km/h</abbr>
</dd>
<dd class="longContent mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">
<abbr title="Northeast">NE</abbr> 6 <abbr title="miles per hour">mph</abbr>
</dd>
<dt>Visibility:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">48 <abbr title="kilometres">km</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">30 miles</dd>
</dl></div>
Answers:
The best approach is to use BeautifulSoup to parse the page.
This is what you want:
import requests
from bs4 import BeautifulSoup
# Fetch the city page. requests has no default timeout, so set one, and stop
# on an HTTP error instead of parsing an error page.
page = requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html", timeout=10)
page.raise_for_status()
resp = page.content
soup = BeautifulSoup(resp, "html.parser")
all_dt = soup.find_all("dt")
# if you want more metrics, just add it to the list
metrics = ["Dew point:","Wind:","Pressure:","Condition:","Tendency:","Temperature:", "Humidity:", "Visibility:"]
# Every metric maps to the list of values found for it on the page
data = {metric: [] for metric in metrics}
for elem in all_dt:
    if elem.text in metrics:
        # The value is the <dd> tag right after the <dt>. Asking for the next
        # sibling *tag* avoids depending on a whitespace node sitting between them.
        value = elem.find_next_sibling()
        has_value = value is not None and value.name == "dd"
        # strip("\n") removes the line breaks around the value (not the letter "n")
        data[elem.text].append(value.text.strip("\n") if has_value else "No Data")
print(data)
bs4’s BeautifulSoup is often used to parse html and extract data. In your case, it can be utilized in several ways; first parse and get the html tree [`soup`] with
import requests
from bs4 import BeautifulSoup
# requests waits indefinitely unless a timeout is given
resp = requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html", timeout=10)
# Raise on a 4xx/5xx response instead of parsing an error page
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
[I like using the `select` function, and you can consult this reference if you’re confused by any of the selectors.]
Now, if you want the current weather, then
# Every <dd> that directly follows a <dt> inside the details.visible-xs element
selector = '#mainContent details.visible-xs dt+dd'
for s in soup.select(selector):
    # The previous sibling tag is the <dt> label belonging to this value
    print(s.find_previous_sibling().text.strip(), s.text.strip(), end=' ')
will output
Wind: NE 8 km/h Temperature: 18.2°C Pressure: 101.9 kPa Dew point: -8.1°C Visibility: 24 km Humidity: 16% Date: 8:00 PM MDT Wednesday 19 October 2022 Observed at: Calgary Int'l Airport
but you also commented about wanting the Condition and Tendency, and you could get the Condition simply with
# select_one returns a Tag (or None when nothing matches); print its text
# rather than interpolating the tag itself, which would output raw markup.
condition = soup.select_one('details.visible-xs div>img+p')
print(f"Condition: {condition.get_text(strip=True) if condition is not None else 'No Data'}")
but Tendency is actually in a different section which is not visible. You could change `selector` to '#mainContent section:first-of-type dt+dd' or even to just 'dt+dd' and run the `for s in soup.select(selector)` loop again, but you will get several repeated values and unwanted data, so we probably need a more structured approach.
So I defined a function that will take a section of the tree and return one value per condition –
def getWeatherData(sectionSoup, pKeys='all', cwcSelector='dt+dd', pEnd=' '):
    """Return one value per weather condition found in a section of the page.

    Args:
        sectionSoup: parsed element to search; it must offer ``select``, and
            the selected elements must offer ``find_previous_sibling``,
            ``find_parent`` and ``text`` (as BeautifulSoup tags do).
        pKeys: list of condition names to report, in the wanted order. Colons
            are ignored when matching. ``'all'`` (or any non-list value)
            reports every condition found, in order of first appearance.
        cwcSelector: CSS selector for the value elements; the label is taken
            from the tag just before each value.
        pEnd: ``end`` argument for the progress ``print`` calls. Pass a string
            or ``None`` to print; any other value (e.g. ``False``) prints nothing.

    Returns:
        dict mapping condition name to its value. With an explicit ``pKeys``
        list the keys are the names exactly as given; with ``'all'`` they are
        the page labels with colons removed.
    """
    printing = pEnd is None or isinstance(pEnd, str)

    wc = []
    for s in sectionSoup.select(cwcSelector):
        label = s.find_previous_sibling()
        if label is None:
            # a custom selector may match a value that has no label before it
            continue
        vis = 'hidden' if s.find_parent(class_='hidden-xs') else 'visible'
        wc.append((label.text.strip(), s.text.strip(), vis))

    # Stable sort: visible entries first, original order kept within each group
    wcs = sorted(wc, key=lambda c: c[2] != 'visible')

    allKeys = pKeys == 'all' or not isinstance(pKeys, list)
    if allKeys:
        # dict.fromkeys de-duplicates while keeping first-seen order
        pKeys = list(dict.fromkeys(c[0] for c in wc))

    forOp = []
    for k in pKeys:
        wanted = k.replace(':', '')
        match = next((c for c in wcs if c[0].replace(':', '') == wanted), None)
        if match is None:
            if printing:
                print(f'! UNAVAILABLE : "{k}" !', end=pEnd)
            continue
        # with 'all', drop the colon from the page label; otherwise keep the
        # caller's own spelling of the key
        forOp.append((wanted if allKeys else k, match[1]))
        if printing:
            print(match[0], match[1], end=pEnd)
    if printing:
        # finish the output line only when something may have been printed
        print()
    # if you want all the values, including repeats:
    # return {'filtered': forOp, 'unfiltered': wc}
    return dict(forOp)
Now you can define which parts of the data you want, and what order you want it in:
toPrint = [
    'Condition', 'Pressure', 'Tendency', 'Temperature',
    'Dew point', 'Humidity', 'Wind', 'Visibility'
] # the parts you mentioned wanting
# The first <section> under #mainContent is the part of the tree that gets searched
cwc = getWeatherData(soup.select_one('#mainContent section'), toPrint)
# "\n" puts the separator, the heading and the dictionary on their own lines
print(f'\n##############\nAs Dictionary:\n{cwc}')
and that will output
Condition: Partly Cloudy Pressure: 101.3 kPa Tendency: Rising Temperature: 8.9°C Dew point: -0.3°C Humidity: 52% Wind: N 43 gust 54 km/h Visibility: 24 km
##############
As Dictionary:
{'Condition': 'Partly Cloudy', 'Pressure': '101.3 kPa', 'Tendency': 'Rising', 'Temperature': '8.9°C', 'Dew point': '-0.3°C', 'Humidity': '52%', 'Wind': 'N 43 gust 54 km/h', 'Visibility': '24 km'}
[Send pEnd='\n' if you want each value printed on a separate line, or pEnd=False if you don’t want to print at all; and send pKeys='all' (instead of toPrint) if you want to see all available data.]
If you wanted, you could also see all collapsible sections (hidden and visible) with
# Find the <details> ancestor of every <dt> that directly follows a <dd>
wcSects = [s.find_parent('details') for s in soup.select('details dd+dt')]
# dict.fromkeys drops duplicates but keeps document order (a set would not)
for s in dict.fromkeys(wcSects):
    print('\n#######', end=' ')
    # Use the <h2> inside <summary> as the title when there is one
    h2 = s.select_one('summary h2')
    print((h2 if h2 else s.summary).text.strip(), '#######')
    getWeatherData(s, 'all', pEnd=' | ')
I’m not able to get the Dew_Point and Wind Data from the site https://weather.gc.ca/city/pages/ab-52_metric_e.html
import requests
from lxml import html
# Get the html page
resp=requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html")
# Build html tree
html_tree=html.fromstring(resp.text)
# Attempted dew point lookup (left commented out by the asker).
# NOTE(review): //text() returns every text node under every matching <dd>, so
# index [1] is not necessarily the dew point figure - confirm against the live page.
#Dew_point=html_tree.xpath("//dd[@class='mrgn-bttm-0 wxo-metric-hide'][(parent::dl[@class='dl-horizontal wxo-conds-col2'])]//text()")[1].replace("Â", "")
# Print Dew_point
# NOTE(review): city_name is not defined anywhere in this snippet.
#print(f"Dew_point in {city_name} is {Dew_point}")
# Attempted wind lookup (left commented out by the asker).
# NOTE(review): in the HTML excerpt further down, the Wind <dt>/<dd> pair sits in
# <dl class="dl-horizontal wxo-conds-col3">, while this XPath requires a parent
# <dl> with class 'dl-horizontal wxo-conds-col2' - presumably why it finds nothing.
#Wind=html_tree.xpath("//dd[@class='longContent mrgn-bttm-0 wxo-metric-hide'][(parent::dl[@class='dl-horizontal wxo-conds-col2'])]//text()")[0].replace("Â", "")
# Print Wind
#print(f"Wind in {city_name} is {Wind}")
Data should be in the following format:
Dew point:-2.3°C
Wind: NE 9 km/h
The direction for wind may change.
I’m not sure how to parse the following HTML code, thanks again for the help!
<dt>Temperature:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">13.2°<abbr title="Celsius">C</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">55.8°
<abbr title="Fahrenheit">F</abbr>
</dd>
<dt>Dew point:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">-2.3°<abbr title="Celsius">C</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">27.9°<abbr title="Fahrenheit">F</abbr>
</dd>
<dt>Humidity:</dt>
<dd class="mrgn-bttm-0">34%</dd>
</dl></div>
<div class="col-sm-4"><dl class="dl-horizontal wxo-conds-col3">
<dt>Wind:</dt>
<dd class="longContent mrgn-bttm-0 wxo-metric-hide">
<abbr title="Northeast">NE</abbr> 9 <abbr title="kilometres per hour">km/h</abbr>
</dd>
<dd class="longContent mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">
<abbr title="Northeast">NE</abbr> 6 <abbr title="miles per hour">mph</abbr>
</dd>
<dt>Visibility:</dt>
<dd class="mrgn-bttm-0 wxo-metric-hide">48 <abbr title="kilometres">km</abbr>
</dd>
<dd class="mrgn-bttm-0 wxo-imperial-hide wxo-city-hidden">30 miles</dd>
</dl></div>
The best approach is to use BeautifulSoup to parse the page.
This is what you want:
import requests
from bs4 import BeautifulSoup
# Fetch the city page. requests has no default timeout, so set one, and stop
# on an HTTP error instead of parsing an error page.
page = requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html", timeout=10)
page.raise_for_status()
resp = page.content
soup = BeautifulSoup(resp, "html.parser")
all_dt = soup.find_all("dt")
# if you want more metrics, just add it to the list
metrics = ["Dew point:","Wind:","Pressure:","Condition:","Tendency:","Temperature:", "Humidity:", "Visibility:"]
# Every metric maps to the list of values found for it on the page
data = {metric: [] for metric in metrics}
for elem in all_dt:
    if elem.text in metrics:
        # The value is the <dd> tag right after the <dt>. Asking for the next
        # sibling *tag* avoids depending on a whitespace node sitting between them.
        value = elem.find_next_sibling()
        has_value = value is not None and value.name == "dd"
        # strip("\n") removes the line breaks around the value (not the letter "n")
        data[elem.text].append(value.text.strip("\n") if has_value else "No Data")
print(data)
bs4’s BeautifulSoup is often used to parse html and extract data. In your case, it can be utilized in several ways; first parse and get the html tree [`soup`] with
import requests
from bs4 import BeautifulSoup
# requests waits indefinitely unless a timeout is given
resp = requests.get("https://weather.gc.ca/city/pages/ab-52_metric_e.html", timeout=10)
# Raise on a 4xx/5xx response instead of parsing an error page
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
[I like using the `select` function, and you can consult this reference if you’re confused by any of the selectors.]
Now, if you want the current weather, then
# Every <dd> that directly follows a <dt> inside the details.visible-xs element
selector = '#mainContent details.visible-xs dt+dd'
for s in soup.select(selector):
    # The previous sibling tag is the <dt> label belonging to this value
    print(s.find_previous_sibling().text.strip(), s.text.strip(), end=' ')
will output
Wind: NE 8 km/h Temperature: 18.2°C Pressure: 101.9 kPa Dew point: -8.1°C Visibility: 24 km Humidity: 16% Date: 8:00 PM MDT Wednesday 19 October 2022 Observed at: Calgary Int'l Airport
but you also commented about wanting the Condition and Tendency, and you could get the Condition simply with
# select_one returns a Tag (or None when nothing matches); print its text
# rather than interpolating the tag itself, which would output raw markup.
condition = soup.select_one('details.visible-xs div>img+p')
print(f"Condition: {condition.get_text(strip=True) if condition is not None else 'No Data'}")
but Tendency is actually in a different section which is not visible. You could change `selector` to '#mainContent section:first-of-type dt+dd' or even to just 'dt+dd' and run the `for s in soup.select(selector)` loop again, but you will get several repeated values and unwanted data, so we probably need a more structured approach.
So I defined a function that will take a section of the tree and return one value per condition –
def getWeatherData(sectionSoup, pKeys='all', cwcSelector='dt+dd', pEnd=' '):
    """Return one value per weather condition found in a section of the page.

    Args:
        sectionSoup: parsed element to search; it must offer ``select``, and
            the selected elements must offer ``find_previous_sibling``,
            ``find_parent`` and ``text`` (as BeautifulSoup tags do).
        pKeys: list of condition names to report, in the wanted order. Colons
            are ignored when matching. ``'all'`` (or any non-list value)
            reports every condition found, in order of first appearance.
        cwcSelector: CSS selector for the value elements; the label is taken
            from the tag just before each value.
        pEnd: ``end`` argument for the progress ``print`` calls. Pass a string
            or ``None`` to print; any other value (e.g. ``False``) prints nothing.

    Returns:
        dict mapping condition name to its value. With an explicit ``pKeys``
        list the keys are the names exactly as given; with ``'all'`` they are
        the page labels with colons removed.
    """
    printing = pEnd is None or isinstance(pEnd, str)

    wc = []
    for s in sectionSoup.select(cwcSelector):
        label = s.find_previous_sibling()
        if label is None:
            # a custom selector may match a value that has no label before it
            continue
        vis = 'hidden' if s.find_parent(class_='hidden-xs') else 'visible'
        wc.append((label.text.strip(), s.text.strip(), vis))

    # Stable sort: visible entries first, original order kept within each group
    wcs = sorted(wc, key=lambda c: c[2] != 'visible')

    allKeys = pKeys == 'all' or not isinstance(pKeys, list)
    if allKeys:
        # dict.fromkeys de-duplicates while keeping first-seen order
        pKeys = list(dict.fromkeys(c[0] for c in wc))

    forOp = []
    for k in pKeys:
        wanted = k.replace(':', '')
        match = next((c for c in wcs if c[0].replace(':', '') == wanted), None)
        if match is None:
            if printing:
                print(f'! UNAVAILABLE : "{k}" !', end=pEnd)
            continue
        # with 'all', drop the colon from the page label; otherwise keep the
        # caller's own spelling of the key
        forOp.append((wanted if allKeys else k, match[1]))
        if printing:
            print(match[0], match[1], end=pEnd)
    if printing:
        # finish the output line only when something may have been printed
        print()
    # if you want all the values, including repeats:
    # return {'filtered': forOp, 'unfiltered': wc}
    return dict(forOp)
Now you can define which parts of the data you want, and what order you want it in:
toPrint = [
    'Condition', 'Pressure', 'Tendency', 'Temperature',
    'Dew point', 'Humidity', 'Wind', 'Visibility'
] # the parts you mentioned wanting
# The first <section> under #mainContent is the part of the tree that gets searched
cwc = getWeatherData(soup.select_one('#mainContent section'), toPrint)
# "\n" puts the separator, the heading and the dictionary on their own lines
print(f'\n##############\nAs Dictionary:\n{cwc}')
and that will output
Condition: Partly Cloudy Pressure: 101.3 kPa Tendency: Rising Temperature: 8.9°C Dew point: -0.3°C Humidity: 52% Wind: N 43 gust 54 km/h Visibility: 24 km
##############
As Dictionary:
{'Condition': 'Partly Cloudy', 'Pressure': '101.3 kPa', 'Tendency': 'Rising', 'Temperature': '8.9°C', 'Dew point': '-0.3°C', 'Humidity': '52%', 'Wind': 'N 43 gust 54 km/h', 'Visibility': '24 km'}
[Send pEnd='\n' if you want each value printed on a separate line, or pEnd=False if you don’t want to print at all; and send pKeys='all' (instead of toPrint) if you want to see all available data.]
If you wanted, you could also see all collapsible sections (hidden and visible) with
# Find the <details> ancestor of every <dt> that directly follows a <dd>
wcSects = [s.find_parent('details') for s in soup.select('details dd+dt')]
# dict.fromkeys drops duplicates but keeps document order (a set would not)
for s in dict.fromkeys(wcSects):
    print('\n#######', end=' ')
    # Use the <h2> inside <summary> as the title when there is one
    h2 = s.select_one('summary h2')
    print((h2 if h2 else s.summary).text.strip(), '#######')
    getWeatherData(s, 'all', pEnd=' | ')