Need help debugging a for loop nested in an if statement nested in a while loop: why is my list not being appended to?
Question:
import requests
from bs4 import BeautifulSoup
import time
import requests
def get_url(*args):
    """Search the weatherspark.com sitemap pages for each given location.

    Each positional argument is a location string such as "Calgary Canada".
    Spaces are replaced with dashes and sitemap pages 5 through 254 are
    fetched in order until one of them contains the dashed name.  Every
    ``<loc>`` entry on that page that is a year-round average-weather URL
    for the location is collected in ``url_site`` and the page number is
    recorded in ``matched_index``.

    Returns:
        ``matched_index`` only.  ``url_site`` is filled in but never handed
        back to the caller -- see the note on the return statements below;
        this is the behaviour the question is asking about.
    """
    matched_index = []
    url_site = []
    for a in args:
        city_country_match = False
        city_country = a.replace(" ", "-")
        sitemap_index = 5
        sitemap_index_max = 255
        while not city_country_match:
            res = requests.get(f'https://weatherspark.com/sitemap-{sitemap_index}.xml')
            soup = BeautifulSoup(res.content, "xml")
            if city_country in res.text:  # is city/country within the sitemap.xml page?
                city_country_match = True  # ends while loop
                for loc in soup.select('loc'):
                    text = loc.text
                    if 'Average-Weather-in-' in text and 'Year-Round' in text and city_country in text:
                        url_site.append(text)  # ***WHY DOES THIS NOT APPEND? ***
                        print(f'Found {a} on page {sitemap_index}')
                        matched_index.append(sitemap_index)
                        time.sleep(1.5)
            else:
                sitemap_index = sitemap_index + 1
                if sitemap_index == sitemap_index_max:
                    print(f"Did not find {city_country}")
                    # Stop searching; without this the loop would keep
                    # requesting pages past the limit forever.
                    break
    return matched_index
    # NOTE: the append above does work.  The line below is unreachable
    # because the function has already returned matched_index, so the
    # caller never sees url_site.
    return url_site
# Two separate location arguments; each string needs its own closing quote.
get_url("Austin Texas United States", "Calgary Canada")
Actual Result:
Found Austin Texas United States on page 20
Found Calgary Canada on page 9
Output: [20,9]
Expected Result: the output would also have included the matching URLs:
["https://weatherspark.com/y/8004/Average-Weather-in-Austin-Texas-United-States-Year-Round", "https://weatherspark.com/y/2349/Average-Weather-in-Calgary-Canada-Year-Round"]
Answers:
You can try to return with:
return matched_index, url_site
Your code does not currently work because it has two consecutive return statements: the function exits at the first one (return matched_index), so the second one (return url_site) is never reached.
import requests
from bs4 import BeautifulSoup
import time
import requests
def get_url(*args):
    """Find the year-round weather page of each location on weatherspark.com.

    Each positional argument is a location string such as "Calgary Canada".
    Spaces are replaced with dashes and sitemap pages 5 through 254 are
    fetched in order.  The search for a location stops at the first page
    whose text contains the dashed name; every ``<loc>`` entry on that page
    that is a year-round average-weather URL for the location is collected.

    Returns:
        A ``(matched_index, url_site)`` tuple: the sitemap page numbers on
        which matches were found, and the matching URLs, in the same order.
        The original version had ``return matched_index`` followed by an
        unreachable ``return url_site``, which is why only the page numbers
        (e.g. ``[20, 9]``) came back.
    """
    matched_index = []
    url_site = []
    sitemap_index_max = 255
    for a in args:
        city_country = a.replace(" ", "-")
        # for/else: the else branch runs only if no page contained the name,
        # so the search always terminates at the page limit.
        for sitemap_index in range(5, sitemap_index_max):
            res = requests.get(f'https://weatherspark.com/sitemap-{sitemap_index}.xml')
            if city_country not in res.text:
                continue  # not on this sitemap page, try the next one
            # Only parse the XML once we know the page mentions the location.
            soup = BeautifulSoup(res.content, "xml")
            for loc in soup.select('loc'):
                text = loc.text
                if 'Average-Weather-in-' in text and 'Year-Round' in text and city_country in text:
                    print(f'Found {a} on page {sitemap_index}')
                    matched_index.append(sitemap_index)
                    url_site.append(text)
            time.sleep(1.5)  # pause before starting on the next location
            break
        else:
            print(f"Did not find {city_country}")
    # A single return statement hands back both lists.
    return matched_index, url_site
# Two separate location arguments; each string needs its own closing quote.
get_url("Austin Texas United States", "Calgary Canada")
Actual Result:
Found Austin Texas United States on page 20
Found Calgary Canada on page 9
Output: [20,9]
Expected Result: the output would also have included the matching URLs:
["https://weatherspark.com/y/8004/Average-Weather-in-Austin-Texas-United-States-Year-Round", "https://weatherspark.com/y/2349/Average-Weather-in-Calgary-Canada-Year-Round"]
You can try to return with:
return matched_index, url_site
Your code does not currently work because it has two consecutive return statements: the function exits at the first one (return matched_index), so the second one (return url_site) is never reached.