python – UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 131198: character maps to <undefined>
Question:
I tried to use the following code to decompress and read the dataset:
# Import the dataset
!wget https://research.aalto.fi/files/16859732/urlset.csv.zip
# import the necessary libraries
import zipfile
import pandas as pd
import numpy as np
import io
path_to_zip_file = "/content/urlset.csv.zip"
directory_to_extract_to= "/content/"
data_url = '/content/urlset.csv'
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
zip_ref.extractall(directory_to_extract_to)
with io.open(data_url, 'r', encoding='windows-1252') as f:
data = pd.read_csv(f)
However, I get the error below:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-13-4360b73cdc0d> in <module>
1 with io.open(data_url, 'r', encoding='windows-1252') as f:
----> 2 data = pd.read_csv(f)
8 frames
/usr/local/lib/python3.8/dist-packages/pandas/io/parsers/c_parser_wrapper.py in read(self, nrows)
222 try:
223 if self.low_memory:
--> 224 chunks = self._reader.read_low_memory(nrows)
225 # destructive to chunks
226 data = _concatenate_chunks(chunks)
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.read_low_memory()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 131198: character maps to <undefined>
What is the cause of the error?
Answers:
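It looks like the CSV file is broken on lines 18232 and 18233. Byte 0x8f has no character assigned in windows-1252, which is why decoding with that encoding fails. You can inspect the file in a text editor such as Mousepad or gedit, or load it with LibreOffice Calc.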
Maybe you could ignore these 2 lines of your dataset? The code below will fix the file.
# Copy urlset.csv to output.csv, dropping every line that is not valid UTF-8.
#
# The input is read in binary mode and decoded one line at a time. A text-mode
# file object cannot be relied on to resume cleanly after a UnicodeDecodeError
# (the chunk it had buffered is already consumed), whereas here a bad byte only
# ever affects the single line that contains it.
encodingError = 0  # number of lines skipped because they failed to decode
lineCounter = 0    # 1-based number of the line currently being processed
# newline='' on the output disables newline translation, so the line endings
# of the kept lines are written back exactly as they were read.
with open('urlset.csv', 'rb') as inFile, \
        open('output.csv', 'w', newline='', encoding='utf-8') as outFile:
    for lineCounter, rawLine in enumerate(inFile, start=1):
        try:
            line = rawLine.decode('utf-8')
        except UnicodeDecodeError as e:
            encodingError += 1
            print(f'encoding error in line {lineCounter}, error message: {e}')
            continue
        outFile.write(line)
print(f'total skipped lines {encodingError}, due to encoding error.')
output of the code:
encoding error in line 18232, error message: 'utf-8' codec can't decode byte 0xad in position 122: invalid start byte
encoding error in line 18233, error message: 'utf-8' codec can't decode byte 0xd5 in position 2: invalid continuation byte
total skipped lines 2, due to encoding error.
I tried to use the following code to decompress and read the dataset:
# Import the dataset
!wget https://research.aalto.fi/files/16859732/urlset.csv.zip
# import the necessary libraries
import zipfile
import pandas as pd
import numpy as np
import io
path_to_zip_file = "/content/urlset.csv.zip"
directory_to_extract_to= "/content/"
data_url = '/content/urlset.csv'
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
zip_ref.extractall(directory_to_extract_to)
with io.open(data_url, 'r', encoding='windows-1252') as f:
data = pd.read_csv(f)
However, I get the error below:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-13-4360b73cdc0d> in <module>
1 with io.open(data_url, 'r', encoding='windows-1252') as f:
----> 2 data = pd.read_csv(f)
8 frames
/usr/local/lib/python3.8/dist-packages/pandas/io/parsers/c_parser_wrapper.py in read(self, nrows)
222 try:
223 if self.low_memory:
--> 224 chunks = self._reader.read_low_memory(nrows)
225 # destructive to chunks
226 data = _concatenate_chunks(chunks)
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.read_low_memory()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 131198: character maps to <undefined>
What is the cause of the error?
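It looks like the CSV file is broken on lines 18232 and 18233. Byte 0x8f has no character assigned in windows-1252, which is why decoding with that encoding fails. You can inspect the file in a text editor such as Mousepad or gedit, or load it with LibreOffice Calc.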
Maybe you could ignore these 2 lines of your dataset? The code below will fix the file.
# Copy urlset.csv to output.csv, dropping every line that is not valid UTF-8.
#
# The input is read in binary mode and decoded one line at a time. A text-mode
# file object cannot be relied on to resume cleanly after a UnicodeDecodeError
# (the chunk it had buffered is already consumed), whereas here a bad byte only
# ever affects the single line that contains it.
encodingError = 0  # number of lines skipped because they failed to decode
lineCounter = 0    # 1-based number of the line currently being processed
# newline='' on the output disables newline translation, so the line endings
# of the kept lines are written back exactly as they were read.
with open('urlset.csv', 'rb') as inFile, \
        open('output.csv', 'w', newline='', encoding='utf-8') as outFile:
    for lineCounter, rawLine in enumerate(inFile, start=1):
        try:
            line = rawLine.decode('utf-8')
        except UnicodeDecodeError as e:
            encodingError += 1
            print(f'encoding error in line {lineCounter}, error message: {e}')
            continue
        outFile.write(line)
print(f'total skipped lines {encodingError}, due to encoding error.')
output of the code:
encoding error in line 18232, error message: 'utf-8' codec can't decode byte 0xad in position 122: invalid start byte
encoding error in line 18233, error message: 'utf-8' codec can't decode byte 0xd5 in position 2: invalid continuation byte
total skipped lines 2, due to encoding error.