"Data-loss while decompressing corrupted data" python threading

Question:

I built a script to convert PDF files to Excel and it works well. Now I want to use threading to make it faster, but I keep getting the message "Data-loss while decompressing corrupted data" on the console, and the output of my `extract` function is random characters whenever I set `max_workers` above 1.

import os
import pdfplumber 
import pandas as pd
import glob
import re
import concurrent.futures

def extract(page):
    """Extract and clean table rows from a single pdfplumber page.

    Each cell has newlines and runs of 2+ spaces stripped. Rows that
    match the module-level ``header`` list, or that do not have exactly
    8 non-None fields, are dropped.

    Returns a list of cleaned 8-field records (possibly empty).
    """
    table = page.extract_table(table_settings={})
    tab = []
    # extract_table returns None when no table is detected on the page;
    # guard so we don't iterate over None.
    if table is None:
        return tab
    for rec in table:
        temp_rec = []
        for field in rec:
            if field is not None:
                # BUG FIX: the original pattern r'n|  +' deleted every
                # literal "n" character from the text (producing the
                # "random characters" output). r'\n|  +' removes
                # newlines and multi-space runs as intended.
                field = re.sub(r'\n|  +', '', field)
                temp_rec.append(field)
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab

# Rows equal to this list are treated as repeated table headers and skipped.
# NOTE: 'Withdrawls' is spelled as it appears in the source PDFs — do not
# "fix" it, or header rows will stop matching.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date', 'Withdrawls', 'Deposits', 'Balance', '']

pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf:
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # Materialize the lazy map iterator: otherwise the results are
            # discarded and any exception raised in a worker is silently
            # swallowed when the unconsumed iterator is dropped.
            result = list(executor.map(extract, pdf.pages))
Asked By: Tony

||

Answers:

The pdfplumber module might be thread-unsafe; using it from multiple threads at the same time can therefore produce unexpected results because of race conditions. In this case, it is a good idea to serialize access with a thread lock.

import os
import pdfplumber 
import pandas as pd
import glob
import re
import concurrent.futures
import threading
# Module-level lock shared by all worker threads: it serializes the
# pdfplumber page parsing in extract(), which is suspected to be
# thread-unsafe.
lock=threading.Lock()

def extract(page):
    """Extract and clean table rows from a pdfplumber page, thread-safely.

    pdfplumber's parsing appears not to be thread-safe, so the actual PDF
    decoding is serialized with the module-level ``lock``; only the cheap
    row-cleaning loop below runs concurrently across threads.

    Returns a list of cleaned 8-field records, with rows equal to the
    module-level ``header`` dropped.
    """
    # BUG FIX: use `with lock` instead of a bare acquire()/release() pair.
    # If extract_table raised, the original code never released the lock,
    # deadlocking every other worker thread.
    with lock:
        table = page.extract_table(table_settings={})
        # Materialize any lazy/iterator result while still holding the
        # lock, since iterating it may touch shared parser state.
        # extract_table returns None when no table is found; the original
        # tuple(None) call would raise TypeError.
        if not isinstance(table, (list, tuple)):
            table = tuple(table) if table is not None else ()
    tab = []
    for rec in table:
        temp_rec = []
        for field in rec:
            if field is not None:
                # BUG FIX: the original pattern r'n|  +' deleted every
                # literal "n" character; r'\n|  +' removes newlines and
                # multi-space runs as intended.
                field = re.sub(r'\n|  +', '', field)
                temp_rec.append(field)
        if temp_rec != header and len(temp_rec) == 8:
            tab.append(temp_rec)
    print(tab)
    return tab

# Rows equal to this list are treated as repeated table headers and skipped.
# NOTE: 'Withdrawls' is spelled as it appears in the source PDFs — do not
# "fix" it, or header rows will stop matching.
header = ['Transaction Date', 'Description', 'InstCode', 'Value Date', 'Withdrawls', 'Deposits', 'Balance', '']

pdfs = glob.glob('*.pdf')
for file in pdfs:
    with pdfplumber.open(file) as pdf:
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # Materialize the lazy map iterator: otherwise the results are
            # discarded and any exception raised in a worker is silently
            # swallowed when the unconsumed iterator is dropped.
            result = list(executor.map(extract, pdf.pages))
Answered By: HelpfulHelper
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.