How to handle TransportError in the Elasticsearch Python client?
Question:
I am uploading a pandas DataFrame to Elasticsearch (using elasticsearch==6.3.1). If the DataFrame is smaller than 100MB it works fine. I am using a solution from How to export pandas data to elasticsearch?
import json
from elasticsearch import Elasticsearch

def rec_to_actions(df):
    # yield an action line followed by a source line for each record (bulk NDJSON format)
    for record in df.to_dict(orient="records"):
        yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' % (INDEX, TYPE))
        yield (json.dumps(record, default=int))

e = Elasticsearch([{'host': 'localhost', 'port': 9200}])
r = e.bulk(rec_to_actions(df))
This works perfectly, but for a DataFrame over 100MB it throws a TransportError:
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
TransportError: TransportError(413, '')
How do I handle this?
I tried setting http.max_content_length: 350mb in the elasticsearch.yml file, but I am still getting the error.
One more question: how can I add a timestamp field in the above function?
Answers:
You can send the documents in batches or use parallel_bulk from the helpers module, for example:
from elasticsearch import helpers

# consume the generator so every chunk is sent and per-document errors are returned
results = list(helpers.parallel_bulk(e, generator_fn(), thread_count=2, chunk_size=400, request_timeout=1000, raise_on_error=False))
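Note that parallel_bulk (like helpers.bulk) expects the actions as dictionaries, not the raw NDJSON strings used with the low-level e.bulk call. A minimal sketch of what generator_fn could look like, assuming the same df, INDEX and TYPE from the question:

def generator_fn():
    # each action is a dict; the helper serializes it into the bulk format internally
    for record in df.to_dict(orient="records"):
        yield {
            "_index": INDEX,
            "_type": TYPE,
            "_source": record,
        }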
Another option is to split the DataFrame into chunks and upload them to Elasticsearch in smaller batches:
import json
from elasticsearch import Elasticsearch

def rec_to_actions(df, index):
    # yield an action line followed by a source line for each record (bulk NDJSON format)
    for record in df.to_dict(orient="records"):
        yield '{ "index" : { "_index" : "%s", "_type": "_doc"}}' % index
        yield json.dumps(record, default=int)

def push_to_elasticsearch(df, index, es_params):
    es = Elasticsearch(
        hosts=[dict(
            host=es_params['host'],
            port=es_params['port'],
            scheme='https'
        )],
        http_auth=(es_params['username'], es_params['password'])
    )
    r = es.bulk(rec_to_actions(df=df, index=index))
    print(f'--> {index} Data upload complete :: ElasticSearch')
    return r

def split_dataframe(df, chunk_size=10000):
    # slice the DataFrame into row ranges of at most chunk_size rows;
    # ceiling division avoids appending a trailing empty chunk
    chunks = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    for i in range(num_chunks):
        chunks.append(df[i * chunk_size:(i + 1) * chunk_size])
    return chunks

chunked_df = split_dataframe(df=your_dataframe)
for chunk in chunked_df:
    push_to_elasticsearch(df=chunk, index=es_index, es_params=elastic_search_params)
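As for the timestamp question: one way is to stamp each record inside the generator before it is serialized. A minimal sketch, assuming a field simply named "timestamp" in ISO-8601 UTC format (adjust the field name and format to your index mapping):

from datetime import datetime, timezone

def rec_to_actions_with_ts(df, index):
    # same generator as above, but each record gets a timestamp before serialization
    now = datetime.now(timezone.utc).isoformat()
    for record in df.to_dict(orient="records"):
        record["timestamp"] = now
        yield '{ "index" : { "_index" : "%s", "_type": "_doc"}}' % index
        yield json.dumps(record, default=int)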