Downloading Zip file from a webpage

Question:

I am trying to download the zip file that can also be downloaded manually by clicking "SCARICA I DATI CSV" on this webpage. I want to do this for 7000+ Italian municipalities using Beautiful Soup.

Right now, I have the following code for one city/municipality:

city_name = "vandoies-vintl"
prov_name = "bz"

page_url = f"http://storico.openbilanci.it/bilanci/{city_name}-comune-{prov_name}"
page = urllib.request.urlopen(page_url).read()
soup = BeautifulSoup(page, 'lxml')

# this is where the code breaks. because the HTML body does not have any mention of "csv" whatsoever, which is weird.

csv_anchors = soup.find_all('a', attrs={'class': 'pull-right csv'})
zip_href = csv_anchors[0]['href']
urllib.request.urlretrieve(f"http://storico.openbilanci.it{zip_href}", f"{city_name}.zip")

I can't find any mention of "csv" when inspecting the parsed page with print(soup). Could someone please help? Thanks!

The following code works.

import pandas as pd
import numpy as np
import time

from bs4 import BeautifulSoup
import urllib.request
import re
import os
import urllib
import zipfile
import re

output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

# Build the municipality table directly with named columns instead of
# assigning .columns after construction.
munis = pd.DataFrame(
    [("monale", "at"), ("portacomaro", "at")],
    columns=["municipality_clean", "prov_abb"],
)

def remove_paren(string):
    """Return *string* with any parenthesized text removed.

    e.g. "Monale (AT)" -> "Monale " (trailing whitespace is stripped by
    the caller).
    """
    # The parentheses must be escaped: the original pattern r'(.*)' was a
    # regex *group* matching the entire string, so every name was wiped
    # out. r'\(.*?\)' matches literal "(...)" spans, non-greedily, so
    # multiple parenthesized chunks are each removed.
    return re.sub(r'\(.*?\)', '', str(string))
# Normalize municipality names: drop parenthesized text, trim whitespace,
# then hyphenate spaces and lower-case every column.
munis['municipality_clean'] = (
    munis['municipality_clean'].map(remove_paren).str.strip()
)
munis = munis.replace(' ', '-', regex=True).apply(lambda col: col.str.lower())

# Track outcomes per municipality. The original code referenced
# scrapesuccess/scrapefail without ever defining them, raising a NameError
# that the bare `except:` clauses first swallowed and then re-raised from
# the outer handler, crashing the loop. Plain lists also avoid the removed
# pandas DataFrame.append API.
scrapesuccess = []
scrapefail = []

for i in range(len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']

    try:
        page_url = 'http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name
        r = urllib.request.urlopen(page_url).read()
        soup = BeautifulSoup(r, 'lxml')
        csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
        csvlink = csv[0]['href']  # IndexError here -> handled as a failure below

        # Download the archive into output_path. The original saved it to
        # the current working directory but then opened it from
        # output_path, so extraction failed unless the two coincided.
        zip_path = os.path.join(output_path, city_name + ".zip")
        urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)
        print(str(i) + ". " + city_name + ": success")
        scrapesuccess.append(munis.iloc[i])

        newfolder = os.path.join(output_path, city_name.capitalize())
        os.makedirs(newfolder, exist_ok=True)

        # Context manager guarantees the archive is closed even if
        # extractall raises.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(newfolder)

    except Exception:
        # Narrowed from a bare `except:`; still best-effort per city so one
        # bad municipality doesn't abort the whole run.
        scrapefail.append(munis.iloc[i])
        print(str(i) + ". " + city_name + ": fail")
Asked By: Pepa

||

Answers:

Here's an example of downloading the zip in memory and writing a city directory containing all the CSV files.

import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from bs4 import BeautifulSoup


class Scraper:
    """Download a municipality's budget zip from storico.openbilanci.it and
    write each contained CSV under <output_root>/<city_name>/<member path>.
    """

    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")
        # Destination root for extracted CSVs; defaults to the original
        # hard-coded placeholder so existing callers are unaffected.
        self.output_root = kwargs.get("output_root", "/path/to/files")

    def main(self) -> None:
        """Run the full pipeline: locate link, download, unzip, write."""
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        """Return the href of the first CSV-zip anchor on the city page.

        Raises IndexError if the page has no matching anchor.
        """
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"

        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")

        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        """Download the zip to a temporary file and return its local path."""
        url = f"{self.url_root}{zip_link}"

        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        """Parse every member of the zip as a ';'-separated CSV.

        Returns {member_name: DataFrame}. The archive is opened in a
        context manager; the original leaked the open ZipFile.
        """
        with ZipFile(file_handle, "r") as zip_file_object:
            return {
                name: pd.read_csv(
                    StringIO(zip_file_object.open(name).read().decode("utf-8")),
                    sep=";",
                )
                for name in zip_file_object.namelist()
            }

    def write_files(self, file_mapping: dict) -> None:
        """Write each DataFrame to <output_root>/<city_name>/<member path>."""
        for file, df in file_mapping.items():
            member = Path(file)
            # Path.parent handles members with no directory component; the
            # original rsplit("/", 1) raised ValueError for top-level
            # members such as "data.csv".
            path = Path(self.output_root) / self.city_name / member.parent
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(path / member.name)


if __name__ == "__main__":
    # Guard the example invocation so importing this module does not
    # trigger a network scrape as a side effect.
    city_name = "vandoies-vintl"
    prov_name = "bz"
    Scraper(city_name=city_name, prov_name=prov_name).main()
Answered By: Jason Baker
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with a check at the top-right corner.