How to read a csv with a broken header in pandas?
Question:
I have a problem with opening CSV file using Pandas in Jupyter and then I tried to open it in Visual studio and it’s also not working. What am I missing?
Code in Jupyter:
path = 'data/DATA_vozila_RAW.csv'
df = pd.read_csv(path)
I also tried adding encoding="Latin-1" to the code.
Code in Visual studio:
data = pd.read_csv('DATA_vozila_RAW.csv', encoding="Latin-1", delimiter=",")
print(data)
Error:
ParserError Traceback (most recent call last)
Cell In[54], line 1
—-> 1 df = pd.read_csv(path)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasutil_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasutil_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:611, in _read(filepath_or_buffer, kwds)
608 return parser
610 with parser:
--> 611 return parser.read(nrows)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:1778, in TextFileReader.read(self, nrows)
1771 nrows = validate_integer("nrows", nrows)
1772 try:
1773 # error: "ParserBase" has no attribute "read"
1774 (
1775 index,
1776 columns,
1777 col_dict,
-> 1778 ) = self._engine.read( # type: ignore[attr-defined]
1779 nrows
1780 )
1781 except Exception:
1782 self.close()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersc_parser_wrapper.py:230, in CParserWrapper.read(self, nrows)
228 try:
229 if self.low_memory:
--> 230 chunks = self._reader.read_low_memory(nrows)
231 # destructive to chunks
232 data = _concatenate_chunks(chunks)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:808, in pandas._libs.parsers.TextReader.read_low_memory()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:866, in pandas._libs.parsers.TextReader._read_rows()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:852, in pandas._libs.parsers.TextReader._tokenize_rows()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:1973, in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 2
Answers:
The header of your csv file seems to be broken/split in two separate lines.
If so, you can start by making a list of the header names and pass it to names
in read_csv
:
with open("DATA_vozila_RAW.csv", "r") as csv_file:
headers = " ".join([line.strip() for line in csv_file.readlines()[:2]]).split(";")
data = pd.read_csv("DATA_vozila_RAW.csv", encoding="Latin-1", delimiter=";",
skiprows=2, header=None, names=headers).iloc[:, 2:]
Output (in Jupyter) :
I have a problem with opening CSV file using Pandas in Jupyter and then I tried to open it in Visual studio and it’s also not working. What am I missing?
Code in Jupyter:
path = 'data/DATA_vozila_RAW.csv'
df = pd.read_csv(path)
I also tried adding encoding="Latin-1" to the code.
Code in Visual studio:
data = pd.read_csv('DATA_vozila_RAW.csv', encoding="Latin-1", delimiter=",")
print(data)
Error:
ParserError Traceback (most recent call last)
Cell In[54], line 1
—-> 1 df = pd.read_csv(path)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasutil_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasutil_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:611, in _read(filepath_or_buffer, kwds)
608 return parser
610 with parser:
--> 611 return parser.read(nrows)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersreaders.py:1778, in TextFileReader.read(self, nrows)
1771 nrows = validate_integer("nrows", nrows)
1772 try:
1773 # error: "ParserBase" has no attribute "read"
1774 (
1775 index,
1776 columns,
1777 col_dict,
-> 1778 ) = self._engine.read( # type: ignore[attr-defined]
1779 nrows
1780 )
1781 except Exception:
1782 self.close()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandasioparsersc_parser_wrapper.py:230, in CParserWrapper.read(self, nrows)
228 try:
229 if self.low_memory:
--> 230 chunks = self._reader.read_low_memory(nrows)
231 # destructive to chunks
232 data = _concatenate_chunks(chunks)
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:808, in pandas._libs.parsers.TextReader.read_low_memory()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:866, in pandas._libs.parsers.TextReader._read_rows()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:852, in pandas._libs.parsers.TextReader._tokenize_rows()
File ~Desktopanalitika_podatkovwork_dirpython-analitika-public.venvlibsite-packagespandas_libsparsers.pyx:1973, in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 2
The header of your csv file seems to be broken/split in two separate lines.
If so, you can start by making a list of the header names and pass it to names
in read_csv
:
with open("DATA_vozila_RAW.csv", "r") as csv_file:
headers = " ".join([line.strip() for line in csv_file.readlines()[:2]]).split(";")
data = pd.read_csv("DATA_vozila_RAW.csv", encoding="Latin-1", delimiter=";",
skiprows=2, header=None, names=headers).iloc[:, 2:]
Output (in Jupyter) :