Read csv from Azure blob Storage and store in a DataFrame
Question:
I’m trying to read multiple CSV files from blob storage using python.
The code that I’m using is:
blob_service_client = BlobServiceClient.from_connection_string(connection_str)
container_client = blob_service_client.get_container_client(container)
blobs_list = container_client.list_blobs(folder_root)
for blob in blobs_list:
blob_client = blob_service_client.get_blob_client(container=container, blob="blob.name")
stream = blob_client.download_blob().content_as_text()
I’m not sure what is the correct way to store the CSV files read in a pandas dataframe.
I tried to use:
df = df.append(pd.read_csv(StringIO(stream)))
But this shows me an error.
Any idea how I can do this?
Answers:
You could download the file from blob storage, then read the data into a pandas DataFrame from the downloaded file.
from azure.storage.blob import BlockBlobService
import pandas as pd
import tables
STORAGEACCOUNTNAME= <storage_account_name>
STORAGEACCOUNTKEY= <storage_account_key>
LOCALFILENAME= <local_file_name>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
#download from blob
t1=time.time()
blob_service=BlockBlobService(account_name=STORAGEACCOUNTNAME,account_key=STORAGEACCOUNTKEY)
blob_service.get_blob_to_path(CONTAINERNAME,BLOBNAME,LOCALFILENAME)
t2=time.time()
print(("It takes %s seconds to download "+blobname) % (t2 - t1))
# LOCALFILE is the file path
dataframe_blobdata = pd.read_csv(LOCALFILENAME)
For more details, see here.
If you want to do the conversion directly, the following code will help. You get the content from the blob object, and with get_blob_to_text there is no need for a local file name.
from io import StringIO
# Read the blob's content as text (uses `blob_service` from the previous snippet)
# and parse it straight into a DataFrame — no local file is written.
blobstring = blob_service.get_blob_to_text(CONTAINERNAME,BLOBNAME).content
df = pd.read_csv(StringIO(blobstring))
import pandas as pd
# Replace 'blob_sas_url' with the actual Blob SAS URL (see the note below on
# generating it from the Azure portal); pandas can read directly from the URL.
data = pd.read_csv('blob_sas_url')
The Blob SAS Url can be found by right clicking on the azure portal’s blob file that you want to import and selecting Generate SAS. Then, click Generate SAS token and URL button and copy the SAS url to above code in place of blob_sas_url.
You can now directly read from BlobStorage into a Pandas DataFrame:
# Read directly from Blob Storage via the abfs:// protocol.
# NOTE(review): this relies on an fsspec-compatible driver (adlfs) being
# installed alongside pandas — confirm it is available in your environment.
mydata = pd.read_csv(
f"abfs://{blob_path}",
storage_options={
"connection_string": os.environ["STORAGE_CONNECTION"]
})
where blob_path
is the path to your file, given as {container-name}/{blob-prefix.csv}
The BlockBlobService as part of azure-storage is deprecated. Use below instead:
!pip install azure-storage-blob
from azure.storage.blob import BlobServiceClient
import pandas as pd
STORAGEACCOUNTURL= <storage_account_url>
STORAGEACCOUNTKEY= <storage_account_key>
LOCALFILENAME= <local_file_name>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
#download from blob
blob_service_client_instance=BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
blob_client_instance = blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME, snapshot=None)
with open(LOCALFILENAME, "wb") as my_blob:
blob_data = blob_client_instance.download_blob()
blob_data.readinto(my_blob)
#import blob to dataframe
df = pd.read_csv(LOCALFILENAME)
LOCALFILENAME can simply be set to the same value as BLOBNAME.
BlockBlobService is indeed deprecated. However, @Deepak’s answer doesn’t work for me. Below works:
import pandas as pd
from io import BytesIO
from azure.storage.blob import BlobServiceClient
CONNECTION_STRING= <connection_string>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINERNAME)
blob_client = container_client.get_blob_client(BLOBNAME)
with BytesIO() as input_blob:
blob_client.download_blob().download_to_stream(input_blob)
input_blob.seek(0)
df = pd.read_csv(input_blob)
You can use blob_client to read the file as text and use that text as input to pandas read_csv()
method. For Example
import pandas as pd
from io import StringIO
from azure.identity import InteractiveBrowserCredential
from azure.storage.blob import BlobServiceClient, ContainerClient

# name of the file
file_name = 'sample_file.csv'
# Note:- include folders if you have a folder structure in the blob
# container ex: -> main/child/sample.csv

# storage account URL
STORAGE_ACCOUNT_URL = 'https://sampleblob.blob.core.windows.net'
# name of the container that holds your CSV file
BLOB_STORAGE_CONTAINER_NAME = "sample-storage-container"
# Here I am using the interactive credential, you may use any other credential
CREDENTIAL = InteractiveBrowserCredential()

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient(STORAGE_ACCOUNT_URL, credential=CREDENTIAL)
container_client = blob_service_client.get_container_client(container=BLOB_STORAGE_CONTAINER_NAME)
blob_client = container_client.get_blob_client(file_name)

if blob_client.exists():  # check if blob exists (colon was missing in the original)
    download_stream = blob_client.download_blob()  # read file
    df = pd.read_csv(StringIO(download_stream.content_as_text()))  # use text as input to pandas
    print(f"Shape of File {file_name} is {df.shape}")
I’m trying to read multiple CSV files from blob storage using python.
The code that I’m using is:
blob_service_client = BlobServiceClient.from_connection_string(connection_str)
container_client = blob_service_client.get_container_client(container)
blobs_list = container_client.list_blobs(folder_root)
for blob in blobs_list:
blob_client = blob_service_client.get_blob_client(container=container, blob="blob.name")
stream = blob_client.download_blob().content_as_text()
I’m not sure what is the correct way to store the CSV files read in a pandas dataframe.
I tried to use:
df = df.append(pd.read_csv(StringIO(stream)))
But this shows me an error.
Any idea how I can do this?
You could download the file from blob storage, then read the data into a pandas DataFrame from the downloaded file.
from azure.storage.blob import BlockBlobService
import pandas as pd
import tables
STORAGEACCOUNTNAME= <storage_account_name>
STORAGEACCOUNTKEY= <storage_account_key>
LOCALFILENAME= <local_file_name>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
#download from blob
t1=time.time()
blob_service=BlockBlobService(account_name=STORAGEACCOUNTNAME,account_key=STORAGEACCOUNTKEY)
blob_service.get_blob_to_path(CONTAINERNAME,BLOBNAME,LOCALFILENAME)
t2=time.time()
print(("It takes %s seconds to download "+blobname) % (t2 - t1))
# LOCALFILE is the file path
dataframe_blobdata = pd.read_csv(LOCALFILENAME)
For more details, see here.
If you want to do the conversion directly, the following code will help. You get the content from the blob object, and with get_blob_to_text there is no need for a local file name.
from io import StringIO
# Read the blob's content as text (uses `blob_service` from the previous snippet)
# and parse it straight into a DataFrame — no local file is written.
blobstring = blob_service.get_blob_to_text(CONTAINERNAME,BLOBNAME).content
df = pd.read_csv(StringIO(blobstring))
import pandas as pd
# Replace 'blob_sas_url' with the actual Blob SAS URL (see the note below on
# generating it from the Azure portal); pandas can read directly from the URL.
data = pd.read_csv('blob_sas_url')
The Blob SAS Url can be found by right clicking on the azure portal’s blob file that you want to import and selecting Generate SAS. Then, click Generate SAS token and URL button and copy the SAS url to above code in place of blob_sas_url.
You can now directly read from BlobStorage into a Pandas DataFrame:
# Read directly from Blob Storage via the abfs:// protocol.
# NOTE(review): this relies on an fsspec-compatible driver (adlfs) being
# installed alongside pandas — confirm it is available in your environment.
mydata = pd.read_csv(
f"abfs://{blob_path}",
storage_options={
"connection_string": os.environ["STORAGE_CONNECTION"]
})
where blob_path
is the path to your file, given as {container-name}/{blob-prefix.csv}
The BlockBlobService as part of azure-storage is deprecated. Use below instead:
!pip install azure-storage-blob
from azure.storage.blob import BlobServiceClient
import pandas as pd
STORAGEACCOUNTURL= <storage_account_url>
STORAGEACCOUNTKEY= <storage_account_key>
LOCALFILENAME= <local_file_name>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
#download from blob
blob_service_client_instance=BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
blob_client_instance = blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME, snapshot=None)
with open(LOCALFILENAME, "wb") as my_blob:
blob_data = blob_client_instance.download_blob()
blob_data.readinto(my_blob)
#import blob to dataframe
df = pd.read_csv(LOCALFILENAME)
LOCALFILENAME can simply be set to the same value as BLOBNAME.
BlockBlobService is indeed deprecated. However, @Deepak’s answer doesn’t work for me. Below works:
import pandas as pd
from io import BytesIO
from azure.storage.blob import BlobServiceClient
CONNECTION_STRING= <connection_string>
CONTAINERNAME= <container_name>
BLOBNAME= <blob_name>
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINERNAME)
blob_client = container_client.get_blob_client(BLOBNAME)
with BytesIO() as input_blob:
blob_client.download_blob().download_to_stream(input_blob)
input_blob.seek(0)
df = pd.read_csv(input_blob)
You can use blob_client to read the file as text and use that text as input to pandas read_csv()
method. For Example
import pandas as pd
from io import StringIO
from azure.identity import InteractiveBrowserCredential
from azure.storage.blob import BlobServiceClient, ContainerClient

# name of the file
file_name = 'sample_file.csv'
# Note:- include folders if you have a folder structure in the blob
# container ex: -> main/child/sample.csv

# storage account URL
STORAGE_ACCOUNT_URL = 'https://sampleblob.blob.core.windows.net'
# name of the container that holds your CSV file
BLOB_STORAGE_CONTAINER_NAME = "sample-storage-container"
# Here I am using the interactive credential, you may use any other credential
CREDENTIAL = InteractiveBrowserCredential()

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient(STORAGE_ACCOUNT_URL, credential=CREDENTIAL)
container_client = blob_service_client.get_container_client(container=BLOB_STORAGE_CONTAINER_NAME)
blob_client = container_client.get_blob_client(file_name)

if blob_client.exists():  # check if blob exists (colon was missing in the original)
    download_stream = blob_client.download_blob()  # read file
    df = pd.read_csv(StringIO(download_stream.content_as_text()))  # use text as input to pandas
    print(f"Shape of File {file_name} is {df.shape}")