Optimization: How to read CSV data in Python into a dict, converting strings to their boolean counterparts?

Question:

This is an optimization question, as I have working code that reads data from a CSV file and creates a Python dictionary from it, with strings like "True" or "false" converted to their boolean counterparts:

import csv

def _load_metadata(path):
    """Loads the metadata from the given file and converts boolean strings to booleans."""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                if value.lower() == "false":
                    newrow[key] = False
                elif value.lower() == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]] = newrow
    return filedict

But I wonder if there is a better/more pythonic way to handle this?

Asked By: Alex


Answers:

Solution from my comment:

import csv

def _load_metadata(path):
    """Loads the metadata from the given file and converts boolean strings to booleans."""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                # .get() returns the mapped boolean, or the original
                # value as the default when the key is not in the dict
                newrow[key] = {"false": False, "true": True}.get(value.lower(), value)
            filedict[row["name"]] = newrow
    return filedict

Here is the documentation for the dict .get() method: https://docs.python.org/3/library/stdtypes.html#dict.get
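For illustration, a minimal standalone example of the .get() fallback behavior (the values here are made up):

bool_map = {"false": False, "true": True}

# Known keys come back converted; unknown keys fall through to the
# second argument, which .get() returns as the default.
print(bool_map.get("true", "unchanged"))           # True
print(bool_map.get("FALSE".lower(), "unchanged"))  # False
print(bool_map.get("hello", "hello"))              # 'hello'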

Answered By: Andrey Topoleov

Your code is fine. In general you shouldn’t worry too much about optimizing pure-Python code. If you really need better performance, you should consider using packages specifically built for your task (or writing your own extension module in C, for example). Some of the popular tools out there for dealing with large data are NumPy, pandas, and PyArrow.

With pandas, you’d do something like this:

import pandas as pd

df = pd.read_csv(path, engine='pyarrow')
# Anchor the patterns so only whole-cell "true"/"false" (any casing)
# are converted, matching the pure-Python behavior
df = df.replace(r'(?i)^true$', True, regex=True)
df = df.replace(r'(?i)^false$', False, regex=True)
# df.iloc supports integer iteration, yielding each row as a Series
filedict = {row['name']: row.to_dict() for row in df.iloc}

This unfortunately seems much slower than processing row-by-row in pure Python, but I'm new to pandas and I'm sure there are better ways to do it.
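One candidate, a sketch I haven't benchmarked and which assumes the file spells the strings exactly as 'True'/'False', is exact-match (non-regex) replacement plus a single to_dict() call:

import pandas as pd

df = pd.read_csv(path)
# Exact-match replacement on whole cell values; no regex engine involved
df = df.replace({'True': True, 'False': False})
# drop=False keeps 'name' inside each inner dict, matching the
# pure-Python versions below
filedict = df.set_index('name', drop=False).to_dict(orient='index')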

As for pure-Python, some of the most obvious ways to optimize the original code did not yield significant improvements:

import csv
import random
import timeit


path = 'temp.csv'
N_ROWS = 100_000
N_TESTS = 100

cols = [chr(k) for k in range(ord('A'), ord('Z')+1)]
choices = ['value', 'True', 'False']

with open(path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name'] + cols)
    writer.writeheader()
    for k in range(N_ROWS):
        row = {key: random.choice(choices) for key in cols}
        row['name'] = f'name_{k}'
        writer.writerow(row)


def load_1(path):
    """Original function"""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                if value.lower() == "false":
                    newrow[key] = False
                elif value.lower() == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow 
    return filedict


def load_2(path):
    """With dict.get method"""
    bool_map = {'true': True, 'false': False}
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                newrow[key] = bool_map.get(value.lower(), value)
            filedict[row["name"]]= newrow 
    return filedict



def load_3(path):
    """With walrus operator"""
    filedict = {}
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            newrow = {}
            for key, value in dict(row).items():
                if (value_lower := value.lower()) == "false":
                    newrow[key] = False
                elif value_lower == "true":
                    newrow[key] = True
                else:
                    newrow[key] = value
            filedict[row["name"]]= newrow 
    return filedict


def load_4(path):
    """With walrus operator and dict comprehension"""
    filedict = {}
    with open(path) as csvfile:
        for row in csv.DictReader(csvfile, delimiter=","):
            filedict[row["name"]] = {
                key: True if (value_str := value.lower()) == 'true'
                     else False if value_str == 'false'
                     else value
                for key, value in row.items()
            }
    return filedict


def load_5(path):
    """With walrus operator and nested dict comprehension"""
    with open(path) as csvfile:
        return {
            row['name']: {
                key: True if (value_str := value.lower()) == 'true'
                     else False if value_str == 'false'
                     else value
                for key, value in row.items()
            }
            for row in csv.DictReader(csvfile, delimiter=',')
        }


assert (load_1(path) == load_2(path) == load_3(path) ==
        load_4(path) == load_5(path))

t1 = timeit.timeit('load_1(path)', globals=globals(), number=N_TESTS)/N_TESTS
t2 = timeit.timeit('load_2(path)', globals=globals(), number=N_TESTS)/N_TESTS
t3 = timeit.timeit('load_3(path)', globals=globals(), number=N_TESTS)/N_TESTS
t4 = timeit.timeit('load_4(path)', globals=globals(), number=N_TESTS)/N_TESTS
t5 = timeit.timeit('load_5(path)', globals=globals(), number=N_TESTS)/N_TESTS
print(f'{t1 = :.3f} s')
print(f'{t2 = :.3f} s')
print(f'{t3 = :.3f} s')
print(f'{t4 = :.3f} s')
print(f'{t5 = :.3f} s')

On my (very slow) computer the results were:

t1 = 2.504 s
t2 = 2.376 s
t3 = 2.288 s
t4 = 2.284 s
t5 = 2.260 s

The original code is definitely more "pythonic" in my opinion, and only about 10% slower.
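If you did want to squeeze out a little more in pure Python, one further idea, not benchmarked above (load_6 is just a hypothetical name for the sketch), is to skip csv.DictReader in favor of plain csv.reader plus zip, avoiding one intermediate dict per row:

def load_6(path):
    """Hypothetical variant: plain csv.reader + zip instead of DictReader."""
    bool_map = {'true': True, 'false': False}
    filedict = {}
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)            # first row holds the column names
        name_idx = header.index('name')
        for fields in reader:
            filedict[fields[name_idx]] = {
                key: bool_map.get(value.lower(), value)
                for key, value in zip(header, fields)
            }
    return filedict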

Answered By: Wood