Python DictWriter writing UTF-8 encoded CSV files
Question:
- I have a list of dictionaries containing unicode strings.
csv.DictWriter
can write a list of dictionaries into a CSV file.
- I want the CSV file to be encoded in UTF8.
- The
csv
module cannot handle converting unicode strings into UTF8.
-
The csv
module documentation has an example for converting everything to UTF8:
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
-
It also has a UnicodeWriter
class.
But… how do I make DictWriter
work with these? Wouldn’t they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don’t get it.
Answers:
When you call csv.writer
with your content, the idea is to pass the content through utf_8_encoder
as it would give you the (utf-8) encoded content.
You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))
You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow()
. For example:
import csv
rows = [
{'name': u'Antonxedn Dvou0159xe1k','country': u'u010cesko'},
{'name': u'Bjxf6rk Guxf0mundsdxf3ttir', 'country': u'xcdsland'},
{'name': u'Sxf8ren Kierkegxe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark
UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There’s also a Python 3 solution that doesn’t required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3’s built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)
My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)
There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it’s something one should do anyway.
- I have a list of dictionaries containing unicode strings.
csv.DictWriter
can write a list of dictionaries into a CSV file.- I want the CSV file to be encoded in UTF8.
- The
csv
module cannot handle converting unicode strings into UTF8. -
The
csv
module documentation has an example for converting everything to UTF8:def utf_8_encoder(unicode_csv_data): for line in unicode_csv_data: yield line.encode('utf-8')
-
It also has a
UnicodeWriter
class.
But… how do I make DictWriter
work with these? Wouldn’t they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don’t get it.
When you call csv.writer
with your content, the idea is to pass the content through utf_8_encoder
as it would give you the (utf-8) encoded content.
You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))
You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow()
. For example:
import csv
rows = [
{'name': u'Antonxedn Dvou0159xe1k','country': u'u010cesko'},
{'name': u'Bjxf6rk Guxf0mundsdxf3ttir', 'country': u'xcdsland'},
{'name': u'Sxf8ren Kierkegxe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark
UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There’s also a Python 3 solution that doesn’t required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3’s built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)
My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)
There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it’s something one should do anyway.