"Data-loss while decompressing corrupted data" python threading
Question:
I built a script to convert PDF files to Excel and it works well. Now I want to use threading to make it faster, but I keep getting the message "Data-loss while decompressing corrupted data" on the console, and the output of my "extract" function is some random characters when I set max_workers above 1.
import os
import pdfplumber
import pandas as pd
import glob
import re
import concurrent.futures
def extract(page):
    """Pull the transaction table out of one PDF page.

    Parameters
    ----------
    page : a pdfplumber page object exposing ``extract_table()``.

    Returns
    -------
    list[list[str]]
        The complete 8-column data rows found on the page; the header row
        and incomplete rows are dropped.
    """
    table = page.extract_table(table_settings={})
    tab = []
    # extract_table() returns None when no table is found on the page;
    # iterating None would raise TypeError.
    for rec in table or []:
        temp_rec = []
        for field in rec:
            if field is not None:
                # Collapse cell text: remove newlines and runs of spaces.
                # BUG FIX: the pattern was r'n| +', which deleted every
                # literal letter 'n' (a lost backslash); restored to r'\n| +'.
                temp_rec.append(re.sub(r'\n| +', '', field))
        # Keep only complete 8-column rows that are not the header.
        # NOTE(review): fields were space-stripped above, so this comparison
        # against the raw (spaced) header may never match — confirm intent.
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab


# Expected column layout of the bank-statement table.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date',
          'Withdrawls', 'Deposits', 'Balance', '']
# Convert every PDF in the working directory; the pages of one document
# are parsed by a pool of four worker threads.
pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf, \
            concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Lazily maps extract() over the pages; results come back in order.
        result = executor.map(extract, pdf.pages)
Answers:
The pdfplumber module might be thread-unsafe; using it from multiple threads at the same time can therefore lead to unexpected results because of race conditions. In this case, it is a good idea to use a thread lock.
import os
import pdfplumber
import pandas as pd
import glob
import re
import concurrent.futures
import threading
# pdfplumber is not thread-safe; one lock serializes all page parsing.
lock = threading.Lock()


def extract(page):
    """Pull the transaction table out of one PDF page (thread-safe).

    Parameters
    ----------
    page : a pdfplumber page object exposing ``extract_table()``.

    Returns
    -------
    list[list[str]]
        The complete 8-column data rows found on the page; the header row
        and incomplete rows are dropped.
    """
    # BUG FIX: the original used bare lock.acquire()/lock.release(); if
    # extract_table() raised, the lock was never released and every other
    # worker deadlocked.  `with` releases it on any exit path.
    with lock:
        table = page.extract_table(table_settings={})
        # Snapshot to a plain list while still holding the lock.
        # extract_table() returns None when no table is found; the old
        # tuple(table) fallback would have raised TypeError on None.
        rows = [] if table is None else list(table)
    tab = []
    for rec in rows:
        temp_rec = []
        for field in rec:
            if field is not None:
                # BUG FIX: the pattern was r'n| +' (lost backslash), which
                # deleted every letter 'n'; restored to r'\n| +'.
                temp_rec.append(re.sub(r'\n| +', '', field))
        # Keep only complete 8-column rows that are not the header.
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab


# Expected column layout of the bank-statement table.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date',
          'Withdrawls', 'Deposits', 'Balance', '']
# Convert every PDF in the working directory; the pages of one document
# are parsed by a pool of four worker threads.
pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf, \
            concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Lazily maps extract() over the pages; results come back in order.
        result = executor.map(extract, pdf.pages)
I built a script to convert PDF files to Excel and it works well. Now I want to use threading to make it faster, but I keep getting the message "Data-loss while decompressing corrupted data" on the console, and the output of my "extract" function is some random characters when I set max_workers above 1.
import os
import pdfplumber
import pandas as pd
import glob
import re
import concurrent.futures
def extract(page):
    """Pull the transaction table out of one PDF page.

    Parameters
    ----------
    page : a pdfplumber page object exposing ``extract_table()``.

    Returns
    -------
    list[list[str]]
        The complete 8-column data rows found on the page; the header row
        and incomplete rows are dropped.
    """
    table = page.extract_table(table_settings={})
    tab = []
    # extract_table() returns None when no table is found on the page;
    # iterating None would raise TypeError.
    for rec in table or []:
        temp_rec = []
        for field in rec:
            if field is not None:
                # Collapse cell text: remove newlines and runs of spaces.
                # BUG FIX: the pattern was r'n| +', which deleted every
                # literal letter 'n' (a lost backslash); restored to r'\n| +'.
                temp_rec.append(re.sub(r'\n| +', '', field))
        # Keep only complete 8-column rows that are not the header.
        # NOTE(review): fields were space-stripped above, so this comparison
        # against the raw (spaced) header may never match — confirm intent.
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab


# Expected column layout of the bank-statement table.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date',
          'Withdrawls', 'Deposits', 'Balance', '']
# Convert every PDF in the working directory; the pages of one document
# are parsed by a pool of four worker threads.
pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf, \
            concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Lazily maps extract() over the pages; results come back in order.
        result = executor.map(extract, pdf.pages)
The pdfplumber module might be thread-unsafe; using it from multiple threads at the same time can therefore lead to unexpected results because of race conditions. In this case, it is a good idea to use a thread lock.
import os
import pdfplumber
import pandas as pd
import glob
import re
import concurrent.futures
import threading
# pdfplumber is not thread-safe; one lock serializes all page parsing.
lock = threading.Lock()


def extract(page):
    """Pull the transaction table out of one PDF page (thread-safe).

    Parameters
    ----------
    page : a pdfplumber page object exposing ``extract_table()``.

    Returns
    -------
    list[list[str]]
        The complete 8-column data rows found on the page; the header row
        and incomplete rows are dropped.
    """
    # BUG FIX: the original used bare lock.acquire()/lock.release(); if
    # extract_table() raised, the lock was never released and every other
    # worker deadlocked.  `with` releases it on any exit path.
    with lock:
        table = page.extract_table(table_settings={})
        # Snapshot to a plain list while still holding the lock.
        # extract_table() returns None when no table is found; the old
        # tuple(table) fallback would have raised TypeError on None.
        rows = [] if table is None else list(table)
    tab = []
    for rec in rows:
        temp_rec = []
        for field in rec:
            if field is not None:
                # BUG FIX: the pattern was r'n| +' (lost backslash), which
                # deleted every letter 'n'; restored to r'\n| +'.
                temp_rec.append(re.sub(r'\n| +', '', field))
        # Keep only complete 8-column rows that are not the header.
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab


# Expected column layout of the bank-statement table.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date',
          'Withdrawls', 'Deposits', 'Balance', '']
# Convert every PDF in the working directory; the pages of one document
# are parsed by a pool of four worker threads.
pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf, \
            concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Lazily maps extract() over the pages; results come back in order.
        result = executor.map(extract, pdf.pages)