Optimization: How to read csv data in python to a dict with converting strings in their booleans?
Question:
This is an optimization question: I have working code that reads data from a CSV file and creates a Python dictionary from it, with strings like True
or false
converted to their boolean counterparts:
def _load_metadata(path):
"""Loads the metadata from the given file and converts boolean strings to booleans."""
filedict = {}
with open(path) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
newrow = {}
for key, value in dict(row).items():
if value.lower() == "false":
newrow[key] = False
elif value.lower() == "true":
newrow[key] = True
else:
newrow[key] = value
filedict[row["name"]]= newrow
return filedict
But I wonder if there is a better/more pythonic way to handle this?
Answers:
Solution from my comment:
def _load_metadata(path):
"""Loads the metadata from the given file and converts boolean strings to booleans."""
filedict = {}
with open(path) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
newrow = {}
for key, value in dict(row).items():
# if value.lower() not in {"false": False, "true": True} it returns value as default
newrow[key] = {"false": False, "true": True}.get(value.lower(), value)
filedict[row["name"]]= newrow
return filedict
Here is the documentation for the dict
.get() method.
Your code is fine. In general you shouldn’t worry too much about optimizing pure-Python code. If you really need better performance, you should consider using packages specifically built for your task (or writing your own extension module in C, for example). Some of the popular tools out there for dealing with large data are NumPy, pandas, and PyArrow.
With pandas
, you’d do something like this:
# pandas variant: bulk-replace boolean-looking strings, then reshape into the
# same {name: row-dict} mapping as the pure-Python versions.
# Assumes 'path' is already defined in the surrounding script.
import pandas as pd
df = pd.read_csv(path, engine='pyarrow')
# (?i) makes the pattern case-insensitive; regex=True replaces matching cells
# with real booleans.  NOTE(review): a regex replace can also match inside a
# longer string (e.g. "untrue") — confirm the data cannot contain such values.
df = df.replace(r'(?i)true', True, regex=True)
df = df.replace(r'(?i)false', False, regex=True)
# df.iloc iterates the rows as Series; to_dict() turns each into a plain dict.
filedict = {row['name']: row.to_dict() for row in df.iloc}
Which unfortunately seems much slower than processing row-by-row in pure-Python, but I’m new to pandas
and I’m sure there are better ways to do this.
As for pure-Python, some of the most obvious ways to optimize the original code did not yield significant improvements:
import csv
import random
import timeit
# Benchmark fixture: generate a CSV with a 'name' key column plus 26 data
# columns, each cell randomly one of a plain string, 'True', or 'False'.
path = 'temp.csv'
N_ROWS = 100_000   # rows of test data
N_TESTS = 100      # timeit repetitions per variant
cols = [chr(k) for k in range(ord('A'), ord('Z')+1)]  # columns 'A'..'Z'
choices = ['value', 'True', 'False']
with open(path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name'] + cols)
    writer.writeheader()
    for k in range(N_ROWS):
        row = {key: random.choice(choices) for key in cols}
        row['name'] = f'name_{k}'  # unique key for the outer dict
        writer.writerow(row)
def load_1(path):
    """Original function"""
    # Benchmark baseline: kept in exactly the question's shape on purpose,
    # so the timings below compare against what was actually posted.
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            # NOTE: value.lower() may run twice per cell here; load_3
            # hoists it with the walrus operator.
            for key, value in dict(row).items():
                if value.lower() == "false":
                    newrow[key] = False
                elif value.lower() == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow
    return filedict
def load_2(path):
    """With dict.get method"""
    # Lookup table built once, outside the row loop.
    bool_map = {'true': True, 'false': False}
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                # .get() falls back to the raw string for non-boolean cells.
                newrow[key] = bool_map.get(value.lower(), value)
            filedict[row["name"]]= newrow
    return filedict
def load_3(path):
    """With walrus operator"""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                # := computes value.lower() once and reuses it in the elif.
                if (value_lower := value.lower()) == "false":
                    newrow[key] = False
                elif value_lower == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow
    return filedict
def load_4(path):
    """With walrus operator and dict comprehension"""
    filedict = {}
    with open(path) as csvfile:
        for row in csv.DictReader(csvfile, delimiter=","):
            # Nested conditional expression: True/False for boolean
            # spellings (lowercased once via :=), otherwise the raw string.
            filedict[row["name"]] = {
                key: True if (value_str:=value.lower()) == 'true'
                else False if value_str == 'false'
                else value
                for key, value in row.items()
            }
    return filedict
def load_5(path):
    """With walrus operator and nested dict comprehension"""
    # Fully comprehension-based variant: one expression builds the whole
    # outer mapping while streaming rows from the reader.
    with open(path) as csvfile:
        return {
            row['name']: {
                key: True if (value_str:=value.lower()) == 'true'
                else False if value_str == 'false'
                else value
                for key, value in row.items()
            }
            for row in csv.DictReader(csvfile, delimiter=',')
        }
# Sanity check: all five implementations must agree before timing them.
# Wrapped in parentheses — as posted, the chained comparison was split over
# two lines with no line continuation, which is a SyntaxError.
assert (load_1(path) == load_2(path) == load_3(path)
        == load_4(path) == load_5(path))
# Average wall-clock seconds per call over N_TESTS runs for each variant.
t1 = timeit.timeit('load_1(path)', globals=globals(), number=N_TESTS)/N_TESTS
t2 = timeit.timeit('load_2(path)', globals=globals(), number=N_TESTS)/N_TESTS
t3 = timeit.timeit('load_3(path)', globals=globals(), number=N_TESTS)/N_TESTS
t4 = timeit.timeit('load_4(path)', globals=globals(), number=N_TESTS)/N_TESTS
t5 = timeit.timeit('load_5(path)', globals=globals(), number=N_TESTS)/N_TESTS
print(f'{t1 = :.3f} s')
print(f'{t2 = :.3f} s')
print(f'{t3 = :.3f} s')
print(f'{t4 = :.3f} s')
print(f'{t5 = :.3f} s')
On my (very slow) computer the results were:
t1 = 2.504 s
t2 = 2.376 s
t3 = 2.288 s
t4 = 2.284 s
t5 = 2.260 s
The original code is definitely more "pythonic" in my opinion, and only about 10% slower.
This is an optimization question: I have working code that reads data from a CSV file and creates a Python dictionary from it, with strings like True
or false
converted to their boolean counterparts:
def _load_metadata(path):
    """Loads the metadata from the given file and converts boolean strings to booleans."""
    # Maps each row's "name" cell to its converted row dict.
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            # dict(row) copies the row first; DictReader already yields
            # plain dicts on Python 3.8+, so the copy is redundant.
            for key, value in dict(row).items():
                if value.lower() == "false":
                    newrow[key] = False
                elif value.lower() == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow
    return filedict
But I wonder if there is a better/more pythonic way to handle this?
Solution from my comment:
def _load_metadata(path):
"""Loads the metadata from the given file and converts boolean strings to booleans."""
filedict = {}
with open(path) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
newrow = {}
for key, value in dict(row).items():
# if value.lower() not in {"false": False, "true": True} it returns value as default
newrow[key] = {"false": False, "true": True}.get(value.lower(), value)
filedict[row["name"]]= newrow
return filedict
Here is the documentation for the dict
.get() method.
Your code is fine. In general you shouldn’t worry too much about optimizing pure-Python code. If you really need better performance, you should consider using packages specifically built for your task (or writing your own extension module in C, for example). Some of the popular tools out there for dealing with large data are NumPy, pandas, and PyArrow.
With pandas
, you’d do something like this:
# pandas variant: bulk-replace boolean-looking strings, then reshape into the
# same {name: row-dict} mapping as the pure-Python versions.
# Assumes 'path' is already defined in the surrounding script.
import pandas as pd
df = pd.read_csv(path, engine='pyarrow')
# (?i) makes the pattern case-insensitive; regex=True replaces matching cells
# with real booleans.  NOTE(review): a regex replace can also match inside a
# longer string (e.g. "untrue") — confirm the data cannot contain such values.
df = df.replace(r'(?i)true', True, regex=True)
df = df.replace(r'(?i)false', False, regex=True)
# df.iloc iterates the rows as Series; to_dict() turns each into a plain dict.
filedict = {row['name']: row.to_dict() for row in df.iloc}
Which unfortunately seems much slower than processing row-by-row in pure-Python, but I’m new to pandas
and I’m sure there are better ways to do this.
As for pure-Python, some of the most obvious ways to optimize the original code did not yield significant improvements:
import csv
import random
import timeit
# Benchmark fixture: generate a CSV with a 'name' key column plus 26 data
# columns, each cell randomly one of a plain string, 'True', or 'False'.
path = 'temp.csv'
N_ROWS = 100_000   # rows of test data
N_TESTS = 100      # timeit repetitions per variant
cols = [chr(k) for k in range(ord('A'), ord('Z')+1)]  # columns 'A'..'Z'
choices = ['value', 'True', 'False']
with open(path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name'] + cols)
    writer.writeheader()
    for k in range(N_ROWS):
        row = {key: random.choice(choices) for key in cols}
        row['name'] = f'name_{k}'  # unique key for the outer dict
        writer.writerow(row)
def load_1(path):
    """Original function"""
    # Benchmark baseline: kept in exactly the question's shape on purpose,
    # so the timings below compare against what was actually posted.
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            # NOTE: value.lower() may run twice per cell here; load_3
            # hoists it with the walrus operator.
            for key, value in dict(row).items():
                if value.lower() == "false":
                    newrow[key] = False
                elif value.lower() == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow
    return filedict
def load_2(path):
    """With dict.get method"""
    # Lookup table built once, outside the row loop.
    bool_map = {'true': True, 'false': False}
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                # .get() falls back to the raw string for non-boolean cells.
                newrow[key] = bool_map.get(value.lower(), value)
            filedict[row["name"]]= newrow
    return filedict
def load_3(path):
    """With walrus operator"""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                # := computes value.lower() once and reuses it in the elif.
                if (value_lower := value.lower()) == "false":
                    newrow[key] = False
                elif value_lower == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow
    return filedict
def load_4(path):
    """With walrus operator and dict comprehension"""
    filedict = {}
    with open(path) as csvfile:
        for row in csv.DictReader(csvfile, delimiter=","):
            # Nested conditional expression: True/False for boolean
            # spellings (lowercased once via :=), otherwise the raw string.
            filedict[row["name"]] = {
                key: True if (value_str:=value.lower()) == 'true'
                else False if value_str == 'false'
                else value
                for key, value in row.items()
            }
    return filedict
def load_5(path):
    """With walrus operator and nested dict comprehension"""
    # Fully comprehension-based variant: one expression builds the whole
    # outer mapping while streaming rows from the reader.
    with open(path) as csvfile:
        return {
            row['name']: {
                key: True if (value_str:=value.lower()) == 'true'
                else False if value_str == 'false'
                else value
                for key, value in row.items()
            }
            for row in csv.DictReader(csvfile, delimiter=',')
        }
# Sanity check: all five implementations must agree before timing them.
# Wrapped in parentheses — as posted, the chained comparison was split over
# two lines with no line continuation, which is a SyntaxError.
assert (load_1(path) == load_2(path) == load_3(path)
        == load_4(path) == load_5(path))
# Average wall-clock seconds per call over N_TESTS runs for each variant.
t1 = timeit.timeit('load_1(path)', globals=globals(), number=N_TESTS)/N_TESTS
t2 = timeit.timeit('load_2(path)', globals=globals(), number=N_TESTS)/N_TESTS
t3 = timeit.timeit('load_3(path)', globals=globals(), number=N_TESTS)/N_TESTS
t4 = timeit.timeit('load_4(path)', globals=globals(), number=N_TESTS)/N_TESTS
t5 = timeit.timeit('load_5(path)', globals=globals(), number=N_TESTS)/N_TESTS
print(f'{t1 = :.3f} s')
print(f'{t2 = :.3f} s')
print(f'{t3 = :.3f} s')
print(f'{t4 = :.3f} s')
print(f'{t5 = :.3f} s')
On my (very slow) computer the results were:
t1 = 2.504 s
t2 = 2.376 s
t3 = 2.288 s
t4 = 2.284 s
t5 = 2.260 s
The original code is definitely more "pythonic" in my opinion, and only about 10% slower.