Is there a Python module to handle csv files with sections?
Question:
I’m seeing more and more csv files containing multiple sections, each containing their own table. For instance this file from 10XGenomics:
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
Sometimes the section headers are even embedded in their own row, e.g.
[gene-expression],,
reference,/path/to/transcriptome,
[libraries],,
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples],,
sample_id,cmo_ids,
sample1,CMO301,
sample2,CMO303,
Is there a Python module to handle this kind of sectioning directly? I couldn’t find how to do it with Pandas or the csv module. E.g. from both examples above I would expect to get a dictionary with one item per section, and then a list of lists for each section.
Some sections have headers; it would be nice if this could be handled too, e.g. similarly to csv.DictReader.
Although it’s not particularly hard to write a solution that can parse this particular example, producing something that works in the general case is much harder. E.g. parsing a simple csv file is easily done with split, and yet the csv module is 400+ lines of Python, and many more lines of C, so what I’m really looking for here is a module to handle this problem in general.
PS: this question is related but the answers do not address the point about the csv parser unfortunately
Answers:
Here is a proposal using pandas that handles both formats:
import pandas as pd

# Load every line of the file into a single "data" column, then tag each row
# with the name of the most recent "[section]" marker at or above it.
df = (
    pd.read_fwf("input.txt", header=None, names=["data"])
    .assign(
        # The brackets must be escaped: unescaped, "[(.*)]" is a character
        # class with no capture group and str.extract() raises ValueError.
        # expand=False yields a Series, which is what a single column needs.
        section=lambda x: x["data"].str.extract(r"\[(.*)\]", expand=False).ffill()
    )
)

# One DataFrame per section, keyed by section name.
d_dfs = {  # type hint: Dict[str, pd.DataFrame]
    k: (
        g.iloc[1:, 0]  # skip the "[section]" marker row itself
        .str.split(",", expand=True)
        .pipe(
            # Promote the first remaining row to column names, then drop it
            # from the data.
            lambda df_: df_.rename(columns=df_.iloc[0]).drop(df_.index[0])
        )
    )
    for k, g in df.groupby("section")
}
Output :
>>> print(d_dfs["libraries"])
fastq_id fastqs feature_types
4 gex1 /path/to/fastqs Gene Expression
5 mux1 /path/to/fastqs Multiplexing Capture
>>> print(d_dfs["samples"])
sample_id cmo_ids
8 sample1 CMO301
9 sample2 CMO303
You can use the configparser module to read your file:
from configparser import ConfigParser
import io

import pandas as pd

# allow_no_value lets every CSV row be stored as an option key with no value.
cfg = ConfigParser(allow_no_value=True)
# Keep rows verbatim; the default optionxform would lower-case them.
cfg.optionxform = str
# NOTE: configparser splits a line at the first "=" or ":" and, in its default
# strict mode, rejects duplicate rows within a section, so files containing
# such rows will not round-trip through this approach.
cfg.read('data.csv')

# One DataFrame per section, keyed by section name.
dfs = {}
for section in cfg.sections():
    # Rows must be joined with a real newline ('\n', not the letter 'n'),
    # otherwise read_csv sees the whole section as one line. Trailing commas
    # (from headers embedded in their own row) are dropped first.
    csv_text = '\n'.join(row.rstrip(',') for row in cfg[section])
    dfs[section] = pd.read_csv(io.StringIO(csv_text))
Output:
>>> dfs['gene-expression']
Empty DataFrame
Columns: [reference, /path/to/transcriptome]
Index: []
>>> dfs['libraries']
fastq_id fastqs feature_types
0 gex1 /path/to/fastqs Gene Expression
1 mux1 /path/to/fastqs Multiplexing Capture
>>> dfs['samples']
sample_id cmo_ids
0 sample1 CMO301
1 sample2 CMO303
You can also extract just a single section:
# allow_no_value lets every CSV row be stored as an option key with no value.
cfg = ConfigParser(allow_no_value=True)
# Keep rows verbatim; the default optionxform would lower-case them.
cfg.optionxform = str
cfg.read('data.csv')


def read_data(section):
    """Return the rows of one section of the parsed file as a DataFrame.

    Args:
        section: Section name as it appears between the square brackets.

    Returns:
        A ``pandas.DataFrame`` whose columns come from the first row of the
        section.

    Raises:
        KeyError: If ``cfg`` has no section with that name.
    """
    # Rows must be joined with a real newline ('\n', not the letter 'n'),
    # otherwise read_csv sees the whole section as one line. Trailing commas
    # (from headers embedded in their own row) are dropped first.
    csv_text = '\n'.join(row.rstrip(',') for row in cfg[section])
    return pd.read_csv(io.StringIO(csv_text))
# Load only the "samples" section, as returned by read_data().
df = read_data('samples')
Output:
>>> df
sample_id cmo_ids
0 sample1 CMO301
1 sample2 CMO303
It is fairly easy to use itertools.groupby() to handle parsing files with sections like this with the csv standard library module:
import csv
import io
import itertools

# The backslash after the opening quotes keeps the sample from starting with
# an empty line.
s = """\
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
"""


def is_header(l):
    """Return True when line *l* is a section marker such as ``[samples]``.

    Markers embedded in their own CSV row (``[samples],,``) are recognised
    too: trailing commas are ignored before checking the brackets.
    """
    marker = l.strip().rstrip(",").rstrip()
    return marker.startswith("[") and marker.endswith("]")


f = io.StringIO(s)
# Blank lines carry no data. Left in, a leading blank line would form a group
# of its own and be mistaken for the first header.
non_blank = (line for line in f if line.strip())
grouped = itertools.groupby(non_blank, is_header)

header = None
for header_group, lines in grouped:
    if header_group:
        # First cell of the last marker line, e.g. "[samples]"; going through
        # csv.reader also copes with the "[samples],," form.
        header = list(csv.reader(lines))[-1][0]
        continue
    if header is None:
        # Rows that appear before any marker belong to no section.
        continue
    section = list(csv.reader(lines))
    print(header)
    print(section)
If you have Python 3.10 or higher, you can combine itertools.groupby() with itertools.pairwise(), which makes this even simpler:
s = """
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
"""
import csv
import io
import itertools
f = io.StringIO(s)
def is_header(l):
return l.strip().startswith("[") and l.strip().endswith("]")
grouped = itertools.groupby(f, is_header)
paired = itertools.pairwise(list(g) for k, g in grouped)
data = {header[-1].strip("[]n"): list(csv.reader(section)) for header, section in paired}
print(data)
I’m seeing more and more csv files containing multiple sections, each containing their own table. For instance this file from 10XGenomics:
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
Sometimes the section headers are even embedded in their own row, e.g.
[gene-expression],,
reference,/path/to/transcriptome,
[libraries],,
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples],,
sample_id,cmo_ids,
sample1,CMO301,
sample2,CMO303,
Is there a Python module to handle this kind of sectioning directly? I couldn’t find how to do it with Pandas or the csv module. E.g. from both examples above I would expect to get a dictionary with one item per section, and then a list of lists for each section.
Some sections have headers; it would be nice if this could be handled too, e.g. similarly to csv.DictReader.
Although it’s not particularly hard to write a solution that can parse this particular example, producing something that works in the general case is much harder. E.g. parsing a simple csv file is easily done with split, and yet the csv module is 400+ lines of Python, and many more lines of C, so what I’m really looking for here is a module to handle this problem in general.
PS: this question is related but the answers do not address the point about the csv parser unfortunately
Here is a proposal using pandas that handles both formats:
import pandas as pd

# Load every line of the file into a single "data" column, then tag each row
# with the name of the most recent "[section]" marker at or above it.
df = (
    pd.read_fwf("input.txt", header=None, names=["data"])
    .assign(
        # The brackets must be escaped: unescaped, "[(.*)]" is a character
        # class with no capture group and str.extract() raises ValueError.
        # expand=False yields a Series, which is what a single column needs.
        section=lambda x: x["data"].str.extract(r"\[(.*)\]", expand=False).ffill()
    )
)

# One DataFrame per section, keyed by section name.
d_dfs = {  # type hint: Dict[str, pd.DataFrame]
    k: (
        g.iloc[1:, 0]  # skip the "[section]" marker row itself
        .str.split(",", expand=True)
        .pipe(
            # Promote the first remaining row to column names, then drop it
            # from the data.
            lambda df_: df_.rename(columns=df_.iloc[0]).drop(df_.index[0])
        )
    )
    for k, g in df.groupby("section")
}
Output :
>>> print(d_dfs["libraries"])
fastq_id fastqs feature_types
4 gex1 /path/to/fastqs Gene Expression
5 mux1 /path/to/fastqs Multiplexing Capture
>>> print(d_dfs["samples"])
sample_id cmo_ids
8 sample1 CMO301
9 sample2 CMO303
You can use the configparser module to read your file:
from configparser import ConfigParser
import io

import pandas as pd

# allow_no_value lets every CSV row be stored as an option key with no value.
cfg = ConfigParser(allow_no_value=True)
# Keep rows verbatim; the default optionxform would lower-case them.
cfg.optionxform = str
# NOTE: configparser splits a line at the first "=" or ":" and, in its default
# strict mode, rejects duplicate rows within a section, so files containing
# such rows will not round-trip through this approach.
cfg.read('data.csv')

# One DataFrame per section, keyed by section name.
dfs = {}
for section in cfg.sections():
    # Rows must be joined with a real newline ('\n', not the letter 'n'),
    # otherwise read_csv sees the whole section as one line. Trailing commas
    # (from headers embedded in their own row) are dropped first.
    csv_text = '\n'.join(row.rstrip(',') for row in cfg[section])
    dfs[section] = pd.read_csv(io.StringIO(csv_text))
Output:
>>> dfs['gene-expression']
Empty DataFrame
Columns: [reference, /path/to/transcriptome]
Index: []
>>> dfs['libraries']
fastq_id fastqs feature_types
0 gex1 /path/to/fastqs Gene Expression
1 mux1 /path/to/fastqs Multiplexing Capture
>>> dfs['samples']
sample_id cmo_ids
0 sample1 CMO301
1 sample2 CMO303
You can also extract just a single section:
# allow_no_value lets every CSV row be stored as an option key with no value.
cfg = ConfigParser(allow_no_value=True)
# Keep rows verbatim; the default optionxform would lower-case them.
cfg.optionxform = str
cfg.read('data.csv')


def read_data(section):
    """Return the rows of one section of the parsed file as a DataFrame.

    Args:
        section: Section name as it appears between the square brackets.

    Returns:
        A ``pandas.DataFrame`` whose columns come from the first row of the
        section.

    Raises:
        KeyError: If ``cfg`` has no section with that name.
    """
    # Rows must be joined with a real newline ('\n', not the letter 'n'),
    # otherwise read_csv sees the whole section as one line. Trailing commas
    # (from headers embedded in their own row) are dropped first.
    csv_text = '\n'.join(row.rstrip(',') for row in cfg[section])
    return pd.read_csv(io.StringIO(csv_text))
# Load only the "samples" section, as returned by read_data().
df = read_data('samples')
Output:
>>> df
sample_id cmo_ids
0 sample1 CMO301
1 sample2 CMO303
It is fairly easy to use itertools.groupby() to handle parsing files with sections like this with the csv standard library module:
import csv
import io
import itertools

# The backslash after the opening quotes keeps the sample from starting with
# an empty line.
s = """\
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
"""


def is_header(l):
    """Return True when line *l* is a section marker such as ``[samples]``.

    Markers embedded in their own CSV row (``[samples],,``) are recognised
    too: trailing commas are ignored before checking the brackets.
    """
    marker = l.strip().rstrip(",").rstrip()
    return marker.startswith("[") and marker.endswith("]")


f = io.StringIO(s)
# Blank lines carry no data. Left in, a leading blank line would form a group
# of its own and be mistaken for the first header.
non_blank = (line for line in f if line.strip())
grouped = itertools.groupby(non_blank, is_header)

header = None
for header_group, lines in grouped:
    if header_group:
        # First cell of the last marker line, e.g. "[samples]"; going through
        # csv.reader also copes with the "[samples],," form.
        header = list(csv.reader(lines))[-1][0]
        continue
    if header is None:
        # Rows that appear before any marker belong to no section.
        continue
    section = list(csv.reader(lines))
    print(header)
    print(section)
If you have Python 3.10 or higher, you can combine itertools.groupby() with itertools.pairwise(), which makes this even simpler:
s = """
[gene-expression]
reference,/path/to/transcriptome
[libraries]
fastq_id,fastqs,feature_types
gex1,/path/to/fastqs,Gene Expression
mux1,/path/to/fastqs,Multiplexing Capture
[samples]
sample_id,cmo_ids
sample1,CMO301
sample2,CMO303
"""
import csv
import io
import itertools
f = io.StringIO(s)
def is_header(l):
return l.strip().startswith("[") and l.strip().endswith("]")
grouped = itertools.groupby(f, is_header)
paired = itertools.pairwise(list(g) for k, g in grouped)
data = {header[-1].strip("[]n"): list(csv.reader(section)) for header, section in paired}
print(data)