Timeout error while uploading a large file to ADLS
Question:
I need to upload a 200 MB file to ADLS using Python.
I’m using the code provided in the official documentation – https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python?tabs=azure-ad
While calling the following function for upload –
def upload_file_to_directory_bulk():
    """Upload a local file to "uploaded-file.txt" under "my-directory" in ADLS.

    Relies on the module-level ``service_client`` created during initialization.
    Any failure is caught and printed rather than propagated.
    """
    try:
        # Resolve container -> directory -> target file clients.
        fs_client = service_client.get_file_system_client(file_system="system")
        dir_client = fs_client.get_directory_client("my-directory")
        target = dir_client.get_file_client("uploaded-file.txt")

        # NOTE(review): "\f" in this literal is a form-feed escape, and the
        # handle is never closed — this reproduces the questioner's code as-is.
        source = open("C:\file-to-upload.txt", 'r')
        payload = source.read()

        # Single-shot upload of the whole in-memory payload.
        target.upload_data(payload, overwrite=True)
    except Exception as e:
        print(e)
It works for small files
I get the error – ('Connection aborted.', timeout('The write operation timed out'))
when I try to upload larger files like 200 mb.
How to resolve this?
Answers:
This must be related to the upload speed. Try increasing the timeout to 60 seconds. Also, if you split the file into chunks, a separate connection (with a separate timeout) will be created for each chunk.
file_client.upload_data(file_contents, overwrite=True, timeout=60)
With chunk size:
# chunk_size is measured in BYTES — a bare 25 would mean 25-byte chunks
# (millions of requests for a 200 MB file). Use 25 MiB per chunk instead.
file_client.upload_data(file_contents, overwrite=True, timeout=30, chunk_size=25 * 1024 * 1024)
You need to increase the timeout value and chunk size in your code while uploading large data.
import os, uuid, sys
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
import os, uuid, sys
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
def initialize_storage_account_ad(storage_account_name):
    """Create the module-level DataLakeServiceClient using Azure AD credentials.

    On success the client is stored in the global ``service_client``; on any
    failure the exception is printed and the global is left unset.
    """
    global service_client
    try:
        credential = DefaultAzureCredential()
        account_url = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)
    except Exception as e:
        print(e)
def upload_file_to_directory_bulk():
    """Upload a local file to "uploaded-file.txt" in the target ADLS Gen2 directory.

    Fixes over the original snippet:
    - the source file is opened in binary mode via a context manager (no leaked
      handle, no text-mode decoding of arbitrary file contents),
    - the data is uploaded exactly once (the original called upload_data twice,
      doubling the transfer for a 200 MB file),
    - both a timeout and a chunk_size are passed, so each chunk gets its own
      connection and timeout window instead of one long single write.
    """
    try:
        file_system_client = service_client.get_file_system_client(file_system="<container-name>")
        directory_client = file_system_client.get_directory_client("<directory>")
        file_client = directory_client.get_file_client("uploaded-file.txt")

        # 'rb': upload_data accepts bytes, and binary mode avoids newline and
        # encoding translation corrupting non-text payloads.
        with open("<file.txt>", 'rb') as local_file:
            file_contents = local_file.read()

        # chunk_size is in bytes; 25 MiB chunks keep each request short enough
        # that the per-request timeout is not hit on slow links.
        file_client.upload_data(
            file_contents,
            overwrite=True,
            timeout=60,
            chunk_size=25 * 1024 * 1024,
        )
    except Exception as e:
        print(e)
storage_account_name= "<storage-account-name>"
You can increase the timeout to 60 seconds; refer below:
#file_client.upload_data(file_contents, overwrite=True)
file_client.upload_data(file_contents, overwrite=True, timeout=60)
and for chunk size add this line:-
#with chunk size
file_client.upload_data(file_contents, overwrite=True, timeout=60)
Output:-
I need to upload a 200 MB file to ADLS using Python.
I’m using the code provided in the official documentation – https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python?tabs=azure-ad
While calling the following function for upload –
def upload_file_to_directory_bulk():
    """Upload a local file to "uploaded-file.txt" under "my-directory" in ADLS.

    Relies on the module-level ``service_client`` created during initialization.
    Any failure is caught and printed rather than propagated.
    """
    try:
        # Resolve container -> directory -> target file clients.
        fs_client = service_client.get_file_system_client(file_system="system")
        dir_client = fs_client.get_directory_client("my-directory")
        target = dir_client.get_file_client("uploaded-file.txt")

        # NOTE(review): "\f" in this literal is a form-feed escape, and the
        # handle is never closed — this reproduces the questioner's code as-is.
        source = open("C:\file-to-upload.txt", 'r')
        payload = source.read()

        # Single-shot upload of the whole in-memory payload.
        target.upload_data(payload, overwrite=True)
    except Exception as e:
        print(e)
It works for small files
I get the error – ('Connection aborted.', timeout('The write operation timed out'))
when I try to upload larger files like 200 mb.
How to resolve this?
This must be related to the upload speed. Try increasing the timeout to 60 seconds. Also, if you split the file into chunks, a separate connection (with a separate timeout) will be created for each chunk.
file_client.upload_data(file_contents, overwrite=True, timeout=60)
With chunk size:
# chunk_size is measured in BYTES — a bare 25 would mean 25-byte chunks
# (millions of requests for a 200 MB file). Use 25 MiB per chunk instead.
file_client.upload_data(file_contents, overwrite=True, timeout=30, chunk_size=25 * 1024 * 1024)
You need to increase the timeout value and chunk size in your code while uploading large data.
import os, uuid, sys
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
import os, uuid, sys
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
def initialize_storage_account_ad(storage_account_name):
    """Create the module-level DataLakeServiceClient using Azure AD credentials.

    On success the client is stored in the global ``service_client``; on any
    failure the exception is printed and the global is left unset.
    """
    global service_client
    try:
        credential = DefaultAzureCredential()
        account_url = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)
    except Exception as e:
        print(e)
def upload_file_to_directory_bulk():
    """Upload a local file to "uploaded-file.txt" in the target ADLS Gen2 directory.

    Fixes over the original snippet:
    - the source file is opened in binary mode via a context manager (no leaked
      handle, no text-mode decoding of arbitrary file contents),
    - the data is uploaded exactly once (the original called upload_data twice,
      doubling the transfer for a 200 MB file),
    - both a timeout and a chunk_size are passed, so each chunk gets its own
      connection and timeout window instead of one long single write.
    """
    try:
        file_system_client = service_client.get_file_system_client(file_system="<container-name>")
        directory_client = file_system_client.get_directory_client("<directory>")
        file_client = directory_client.get_file_client("uploaded-file.txt")

        # 'rb': upload_data accepts bytes, and binary mode avoids newline and
        # encoding translation corrupting non-text payloads.
        with open("<file.txt>", 'rb') as local_file:
            file_contents = local_file.read()

        # chunk_size is in bytes; 25 MiB chunks keep each request short enough
        # that the per-request timeout is not hit on slow links.
        file_client.upload_data(
            file_contents,
            overwrite=True,
            timeout=60,
            chunk_size=25 * 1024 * 1024,
        )
    except Exception as e:
        print(e)
storage_account_name= "<storage-account-name>"
You can increase the timeout to 60 seconds; refer below:
#file_client.upload_data(file_contents, overwrite=True)
file_client.upload_data(file_contents, overwrite=True, timeout=60)
and for chunk size add this line:-
#with chunk size
file_client.upload_data(file_contents, overwrite=True, timeout=60)
Output:-