dask: return None or empty from delayed task
Question:
I would like to return an empty dataframe or None from a set of delayed tasks where parsing fails, e.g.:
import os

import dask.dataframe as dd
import dask.delayed
import pandas as pd
def _read(self, filename):
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
return df
except BaseException as e:
return pd.DataFrame()
# Build one delayed parse task per file under <path>/files, then
# stitch them lazily into a single dask DataFrame.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
One or two of the files fail being parsed, and at the moment I receive a metadata mismatch. I could return a dataframe with the dask dataframe metadata specified, but just wondering if there’s a better way.
Answers:
Going with the comment from @mdurant, it’s not as easy as you’d expect to copy a dataframe maintaining types, but this seems to work. This wouldn’t work if your first file errors out, of course.
import dask.dataframe as dd
import dask.delayed
_default_record = None
def _read(self, filename):
global _default_record
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
if _default_record is None:
_default_record = pd.DataFrame.from_items([
(name, pd.Series(data=None, dtype=series.dtype))
for name, series in df.head(1).iteritems()])
return df
except BaseException as e:
return _default_record
# One delayed parse per discovered file; combine lazily at the end.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
This updates the answer from @morganics for what I’m assuming are newer versions of pandas (1.1.5) and dask (2020.12.0).
import dask.dataframe as dd
import dask.delayed
import pandas as pd
_default_record = None
def _read(self, filename):
global _default_record
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
if _default_record is None:
_default_record = pd.DataFrame([
{name: pd.Series(data=None, dtype=series.dtype)
for name, series in df.head(1).iteritems()})
return df
except BaseException as e:
return _default_record
# Queue a delayed parse for every file below <path>/files and let
# dask assemble the partitions lazily.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
I made _default_record a global variable, and I removed the from_items
method from the dataframe constructor, because it doesn’t exist in my version of pandas.
I would like to return an empty dataframe or None from a set of delayed tasks where parsing fails, e.g.:
import dask.dataframe as dd
import dask.delayed
def _read(self, filename):
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
return df
except BaseException as e:
return pd.DataFrame()
# Build one delayed parse task per file under <path>/files, then
# stitch them lazily into a single dask DataFrame.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
One or two of the files fail being parsed, and at the moment I receive a metadata mismatch. I could return a dataframe with the dask dataframe metadata specified, but just wondering if there’s a better way.
Going with the comment from @mdurant, it’s not as easy as you’d expect to copy a dataframe maintaining types, but this seems to work. This wouldn’t work if your first file errors out, of course.
import dask.dataframe as dd
import dask.delayed
_default_record = None
def _read(self, filename):
global _default_record
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
if _default_record is None:
_default_record = pd.DataFrame.from_items([
(name, pd.Series(data=None, dtype=series.dtype))
for name, series in df.head(1).iteritems()])
return df
except BaseException as e:
return _default_record
# One delayed parse per discovered file; combine lazily at the end.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
This updates the answer from @morganics for what I’m assuming are newer versions of pandas (1.1.5) and dask (2020.12.0).
import dask.dataframe as dd
import dask.delayed
import pandas as pd
_default_record = None
def _read(self, filename):
global _default_record
try:
df = pd.read_csv(filename, sep=';', decimal=',', encoding='latin1', index_col=False)
if _default_record is None:
_default_record = pd.DataFrame([
{name: pd.Series(data=None, dtype=series.dtype)
for name, series in df.head(1).iteritems()})
return df
except BaseException as e:
return _default_record
# Queue a delayed parse for every file below <path>/files and let
# dask assemble the partitions lazily.
tasks = [
    dask.delayed(_read, pure=True)(os.path.join(dirpath, name))
    for dirpath, _subdirs, filenames in os.walk(os.path.join(self._path, "files"))
    for name in filenames
]
ddf = dd.from_delayed(tasks)
I made _default_record a global variable, and I removed the from_items
method from the dataframe constructor, because it doesn’t exist in my version of pandas.