Python: Extract text from multiple pdf and paste on excel

Question

I'm completely new to Python; could you help me correct this code?

I would like to add 2 things:

Run the operation on multiple PDFs instead of just one, pasting each file's content into A2, A3, A4 and so on.
If possible, write the name of each PDF file in the adjacent column (B2, B3, B4).

Thank you in advance. This is the code I'm working with:

import PyPDF2
import openpyxl
# Read the text of the first page of a single PDF. The `with` block closes
# the file handle afterwards; the text is extracted while the file is still
# open.
with open("file.pdf", 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    pageObj = pdfReader.getPage(0)
    mytext = pageObj.extractText()

# Put the extracted text into cell A1 of the active sheet, then save the
# workbook back to the same file.
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext

wb.save('excel.xlsx')
print('DONE!!')

I've modified the code as suggested and the loop seems to get all the pages! But maybe I have to rework `sheet[f'A{row}'].value = '\n'.join(output)`, because it seems to print a lot of spaces.


import PyPDF2
import openpyxl
import os
import glob
root_dir = "your directory"

# Collect every PDF below root_dir, searching sub-directories too.
# os.path.join supplies the separator, so root_dir works with or without a
# trailing slash. glob already yields paths that include root_dir, so each
# match is stored as-is instead of being joined to a directory again.
filenames = [
    path
    for path in glob.iglob(os.path.join(root_dir, '**', '*'), recursive=True)
    if path.lower().endswith('.pdf')
]

wb = openpyxl.load_workbook('excel.xlsx')  # your Excel file
sheet = wb.active
sheet.title = 'MyPDF'

for row, filename in enumerate(filenames, start=1):
    # One list entry per page. The list is created once per file, so no
    # page is overwritten by the next one.
    output = []
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        for i in range(pdfReader.numPages):
            page_text = pdfReader.getPage(i).extractText()
            print(page_text)
            output.append(page_text)

    # Column A: all pages of this PDF separated by newlines.
    # Column B: the path of the PDF the text came from.
    sheet[f'A{row}'].value = '\n'.join(output)
    sheet[f'B{row}'].value = filename

wb.save('excel.xlsx')  # your Excel file
print('DONE!!')

Asked By: Gabry

||

Source

Answer 1

You basically want to put the code you wrote which reads the pdf file into a for loop which iterates over the filenames (in this case, the filenames are stored as a tuple).

Using enumerate, row increments every iteration of the loop, and starts at 1. So the text and filename will be put into A1 and B1, then A2 and B2, and so on.

import PyPDF2
import openpyxl

# PDFs to process; the position in this tuple decides the spreadsheet row.
filenames = (
    "file.pdf",
    "file2.pdf",
    "file3.pdf",
)

wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'

# enumerate(..., start=1) numbers the files 1, 2, 3, ... so the first PDF
# lands in row 1, the second in row 2, and so on.
for row, filename in enumerate(filenames, start=1):
    # Extract the first page's text while the file is still open.
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        mytext = pdfReader.getPage(0).extractText()

    # Column A gets the extracted text, column B the file it came from.
    sheet[f'A{row}'].value = mytext
    sheet[f'B{row}'].value = filename

wb.save('excel.xlsx')
print('DONE!!')

You can get a list of all the filenames ending in .pdf quite easily by iterating over all the files in a directory, and checking if the filename ends in .pdf. If it does, use os.path.join to give you the full filepath, and append it to the filenames list.

You could also use the glob module, too.

import os

# Folder to scan. The raw string keeps the backslashes literal.
directory = r"C:\Stuff\PDF Files"

# Full paths of the entries directly inside `directory` whose names end in
# ".pdf", compared case-insensitively. Sub-folders are not searched.
filenames = [
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.lower().endswith(".pdf")
]

Updated code:

import PyPDF2
import openpyxl
import os
import glob
import re
import itertools

# Used to strip characters that can't be written to a spreadsheet
# See https://stackoverflow.com/a/93029/3589122
# The character set covers code points 0x00-0x1f and 0x7f-0x9f.
control_chars = ''.join(
    chr(code_point)
    for code_point in itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))
)
control_char_re = re.compile('[' + re.escape(control_chars) + ']')


def remove_control_chars(s):
    """Return *s* with every character found in ``control_chars`` removed."""
    return re.sub(control_char_re, '', s)

root_dir = 'your directory'  # folder that is searched recursively for PDFs

# os.path.join supplies the separator, so root_dir works with or without a
# trailing slash. iglob already returns a lazy iterator of matching paths.
filenames = glob.iglob(os.path.join(root_dir, '**', '*.pdf'), recursive=True)

wb = openpyxl.load_workbook('excel.xlsx')  # your Excel file
sheet = wb.active
sheet.title = 'MyPDF'

# The row counter advances only after a PDF was read successfully, so a
# skipped file does not leave an empty row behind.
row = 1
for filename in filenames:
    with open(filename, 'rb') as f:
        try:
            pdfReader = PyPDF2.PdfFileReader(f)

            output = []
            for i in range(pdfReader.numPages):
                print(i, filename)
                page_text = pdfReader.getPage(i).extractText()
                # Clean each page on its own: remove_control_chars expects a
                # string, not the list of pages. Newline is one of the
                # characters it strips, so the page separator is added only
                # after cleaning.
                output.append(remove_control_chars(page_text))
        except Exception as e:
            # Deliberately broad: any PDF that cannot be processed is
            # reported and skipped so the remaining files still get written.
            print(f'Error: PyPDF2 could not read {filename}. Continuing... ({e})')
            continue

    # Column A: all pages of this PDF separated by newlines.
    # Column B: the path of the PDF the text came from.
    sheet[f'A{row}'].value = '\n'.join(output)
    sheet[f'B{row}'].value = filename
    row += 1

wb.save('excel.xlsx')  # your Excel file
print('DONE!!')

Answered By: GordonAitchJay

Answer 2

Have you tried with more than 6 or 7 files? I get this error with 7 PDFs:


TypeError                                 Traceback (most recent call last)
<ipython-input-14-07fb0aa603b8> in <module>
     23         for i in range(count):
     24             page = pdfReader.getPage(i)
---> 25             output.append(page.extractText())
     26             print(output)
     27 

~\anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
   1283         """
   1284         deprecate_with_replacement("extractText", "extract_text")
-> 1285         return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
   1286 
   1287     mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())

~\anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, space_width)
   1261         :return: a string object.
   1262         """
-> 1263         return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
   1264 
   1265     def extract_xform_text(

~\anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, space_width, content_key)
   1243                     text = ""
   1244             else:
-> 1245                 process_operation(operator, operands)
   1246         output += text  # just in case of
   1247         return output

~\anaconda3\lib\site-packages\PyPDF2\_page.py in process_operation(operator, operands)
   1195                 tm_matrix[5] -= TL
   1196             elif operator == b"Tj":
-> 1197                 text += operands[0].translate(cmap)
   1198             else:
   1199                 return None

TypeError: a bytes-like object is required, not 'dict'

Answered By: Gabry

Python: Extract text from multiple pdf and paste on excel

Question:

Answers: