Python: Extract text from multiple pdf and paste on excel
Question:
I'm totally new to Python; could you help me correct this code?
I would like to add two things:
- run the operation on multiple PDFs instead of just one, pasting the content into A2, A3, A4 and so on
- if possible, write the name of each PDF file in the next column (B2, B3, B4).
Thank you in advance. This is the code I'm working with:
import PyPDF2
import openpyxl
# Copy the text of the first page of one PDF into cell A1 of an existing
# workbook. The `with` block guarantees the PDF handle is closed, even if
# PyPDF2 raises while parsing.
with open("file.pdf", 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    pageObj = pdfReader.getPage(0)  # page index 0 = first page only
    mytext = pageObj.extractText()
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('excel.xlsx')  # overwrites the workbook that was loaded above
print('DONE!!')
I've modified the code as suggested and the loop now seems to get all the pages! But maybe I have to rework `sheet[f'A{row}'].value = '\n'.join(output)`, because it seems to print a lot of spaces:
import PyPDF2
import openpyxl
import os
import glob
root_dir = "your directory"
# Recursively collect every PDF below root_dir. os.path.join supplies the
# separator, so root_dir works with or without a trailing slash, and glob
# already yields paths that include root_dir (no further joining needed).
filenames = []
for filename in glob.iglob(os.path.join(root_dir, '**', '*'), recursive=True):
    # lower() makes the extension check case-insensitive (.pdf, .PDF, ...)
    if filename.lower().endswith('.pdf'):
        filenames.append(filename)
wb = openpyxl.load_workbook('excel.xlsx')  # your file excel
sheet = wb.active
sheet.title = 'MyPDF'
# One spreadsheet row per PDF: text in column A, file path in column B.
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        # Accumulate the text of *every* page; the list must be created once
        # per file (not once per page) or earlier pages are thrown away.
        output = []
        for i in range(pdfReader.numPages):
            output.append(pdfReader.getPage(i).extractText())
    # Join the list of page texts with newlines. Joining a plain string
    # instead would insert the separator between every single character.
    sheet[f'A{row}'].value = '\n'.join(output)
    sheet[f'B{row}'].value = filename
wb.save('excel.xlsx')  # your file excel
print('DONE!!')
Answers:
You basically want to put the code you wrote, which reads the PDF file, into a `for` loop that iterates over the filenames (in this case, the filenames are stored as a tuple).
Using `enumerate`, `row` increments on every iteration of the loop and starts at 1. So the text and filename will be put into A1 and B1, then A2 and B2, and so on.
import PyPDF2
import openpyxl
# PDFs to process, in the order their rows should appear in the sheet.
filenames = ("file.pdf",
             "file2.pdf",
             "file3.pdf",
             )
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
# enumerate(..., start=1) pairs each file with its spreadsheet row: 1, 2, 3...
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        pageObj = pdfReader.getPage(0)  # first page only
        mytext = pageObj.extractText()
    sheet[f'A{row}'].value = mytext    # column A: extracted text
    sheet[f'B{row}'].value = filename  # column B: source file name
wb.save('excel.xlsx')
print('DONE!!')
You can get a list of all the filenames ending in `.pdf` quite easily by iterating over all the files in a directory and checking whether each filename ends in `.pdf`. If it does, use `os.path.join` to build the full file path and append it to the `filenames` list.
You could also use the `glob` module.
import os
# Build the full path of every PDF that sits directly inside `directory`
# (os.listdir does not descend into sub-folders).
filenames = []
directory = r"C:\Stuff\PDF Files"  # raw string keeps the backslashes literal
for filename in os.listdir(directory):
    # lower() makes the extension check case-insensitive (.pdf, .PDF, ...)
    if filename.lower().endswith(".pdf"):
        filenames.append(os.path.join(directory, filename))
Updated code:
import PyPDF2
import openpyxl
import os
import glob
import re
import itertools
# Used to strip characters that can't be written to a spreadsheet
# See https://stackoverflow.com/a/93029/3589122
# Covers code points 0x00-0x1f and 0x7f-0x9f. Note that this range includes
# tab, line feed and carriage return, so those are removed as well.
control_chars = ''.join(map(chr, itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))))
control_char_re = re.compile('[%s]' % re.escape(control_chars))


def remove_control_chars(s):
    """Return the string *s* with every control character deleted.

    Characters in the ranges U+0000-U+001F and U+007F-U+009F are dropped
    (not replaced by a space); all other characters are kept unchanged.
    *s* must be a single string, not a list of strings.
    """
    return control_char_re.sub('', s)
root_dir = 'your directory'  # folder that is searched recursively for PDFs
# os.path.join supplies the separator, so root_dir works with or without a
# trailing slash. iglob is already lazy, so no generator wrapper is needed.
# NOTE: the '*.pdf' pattern is case-sensitive on case-sensitive file systems.
filenames = glob.iglob(os.path.join(root_dir, '**', '*.pdf'), recursive=True)
wb = openpyxl.load_workbook('excel.xlsx')  # your file excel
sheet = wb.active
sheet.title = 'MyPDF'
# `row` is advanced manually (instead of enumerate) so that unreadable PDFs
# are skipped without leaving empty rows in the sheet.
row = 1
for filename in filenames:
    with open(filename, 'rb') as f:
        try:
            # NOTE(review): PdfFileReader / numPages / getPage / extractText
            # are the legacy PyPDF2 names; newer releases appear to deprecate
            # them (extractText -> extract_text) - confirm against the
            # installed version.
            pdfReader = PyPDF2.PdfFileReader(f)
            count = pdfReader.numPages
            output = []  # one extracted string per page
            for i in range(count):
                print(i, filename)  # progress: page index and file
                page = pdfReader.getPage(i)
                output.append(page.extractText())
        except Exception as e:
            # Deliberately broad: a malformed PDF can make PyPDF2 raise many
            # different exception types. Skip the file and keep going.
            print(f'Error: PyPDF2 could not read {filename}. Continuing... ({e})')
            continue
    # remove_control_chars expects a single string, so clean each page on its
    # own and join afterwards; the joining newlines therefore survive.
    sheet[f'A{row}'].value = '\n'.join(remove_control_chars(text) for text in output)
    sheet[f'B{row}'].value = filename
    row += 1
wb.save('excel.xlsx')  # your file excel
print('DONE!!')
Have you tried with more than 6 or 7 files? I get this error with 7 PDFs:
TypeError Traceback (most recent call last)
<ipython-input-14-07fb0aa603b8> in <module>
23 for i in range(count):
24 page = pdfReader.getPage(i)
---> 25 output.append(page.extractText())
26 print(output)
27
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
1283 """
1284 deprecate_with_replacement("extractText", "extract_text")
-> 1285 return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
1286
1287 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, space_width)
1261 :return: a string object.
1262 """
-> 1263 return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
1264
1265 def extract_xform_text(
~\anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, space_width, content_key)
1243 text = ""
1244 else:
-> 1245 process_operation(operator, operands)
1246 output += text # just in case of
1247 return output
~\anaconda3\lib\site-packages\PyPDF2\_page.py in process_operation(operator, operands)
1195 tm_matrix[5] -= TL
1196 elif operator == b"Tj":
-> 1197 text += operands[0].translate(cmap)
1198 else:
1199 return None
TypeError: a bytes-like object is required, not 'dict'
I'm totally new to Python; could you help me correct this code?
I would like to add two things:
- run the operation on multiple PDFs instead of just one, pasting the content into A2, A3, A4 and so on
- if possible, write the name of each PDF file in the next column (B2, B3, B4).
Thank you in advance. This is the code I'm working with:
import PyPDF2
import openpyxl
# Copy the text of the first page of one PDF into cell A1 of an existing
# workbook. The `with` block guarantees the PDF handle is closed, even if
# PyPDF2 raises while parsing.
with open("file.pdf", 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    pageObj = pdfReader.getPage(0)  # page index 0 = first page only
    mytext = pageObj.extractText()
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('excel.xlsx')  # overwrites the workbook that was loaded above
print('DONE!!')
I've modified the code as suggested and the loop now seems to get all the pages! But maybe I have to rework `sheet[f'A{row}'].value = '\n'.join(output)`, because it seems to print a lot of spaces:
import PyPDF2
import openpyxl
import os
import glob
root_dir = "your directory"
# Recursively collect every PDF below root_dir. os.path.join supplies the
# separator, so root_dir works with or without a trailing slash, and glob
# already yields paths that include root_dir (no further joining needed).
filenames = []
for filename in glob.iglob(os.path.join(root_dir, '**', '*'), recursive=True):
    # lower() makes the extension check case-insensitive (.pdf, .PDF, ...)
    if filename.lower().endswith('.pdf'):
        filenames.append(filename)
wb = openpyxl.load_workbook('excel.xlsx')  # your file excel
sheet = wb.active
sheet.title = 'MyPDF'
# One spreadsheet row per PDF: text in column A, file path in column B.
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        # Accumulate the text of *every* page; the list must be created once
        # per file (not once per page) or earlier pages are thrown away.
        output = []
        for i in range(pdfReader.numPages):
            output.append(pdfReader.getPage(i).extractText())
    # Join the list of page texts with newlines. Joining a plain string
    # instead would insert the separator between every single character.
    sheet[f'A{row}'].value = '\n'.join(output)
    sheet[f'B{row}'].value = filename
wb.save('excel.xlsx')  # your file excel
print('DONE!!')
You basically want to put the code you wrote, which reads the PDF file, into a `for` loop that iterates over the filenames (in this case, the filenames are stored as a tuple).
Using `enumerate`, `row` increments on every iteration of the loop and starts at 1. So the text and filename will be put into A1 and B1, then A2 and B2, and so on.
import PyPDF2
import openpyxl
# PDFs to process, in the order their rows should appear in the sheet.
filenames = ("file.pdf",
             "file2.pdf",
             "file3.pdf",
             )
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
# enumerate(..., start=1) pairs each file with its spreadsheet row: 1, 2, 3...
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        pageObj = pdfReader.getPage(0)  # first page only
        mytext = pageObj.extractText()
    sheet[f'A{row}'].value = mytext    # column A: extracted text
    sheet[f'B{row}'].value = filename  # column B: source file name
wb.save('excel.xlsx')
print('DONE!!')
You can get a list of all the filenames ending in `.pdf` quite easily by iterating over all the files in a directory and checking whether each filename ends in `.pdf`. If it does, use `os.path.join` to build the full file path and append it to the `filenames` list.
You could also use the `glob` module.
import os
# Build the full path of every PDF that sits directly inside `directory`
# (os.listdir does not descend into sub-folders).
filenames = []
directory = r"C:\Stuff\PDF Files"  # raw string keeps the backslashes literal
for filename in os.listdir(directory):
    # lower() makes the extension check case-insensitive (.pdf, .PDF, ...)
    if filename.lower().endswith(".pdf"):
        filenames.append(os.path.join(directory, filename))
Updated code:
import PyPDF2
import openpyxl
import os
import glob
import re
import itertools
# Used to strip characters that can't be written to a spreadsheet
# See https://stackoverflow.com/a/93029/3589122
# Covers code points 0x00-0x1f and 0x7f-0x9f. Note that this range includes
# tab, line feed and carriage return, so those are removed as well.
control_chars = ''.join(map(chr, itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))))
control_char_re = re.compile('[%s]' % re.escape(control_chars))


def remove_control_chars(s):
    """Return the string *s* with every control character deleted.

    Characters in the ranges U+0000-U+001F and U+007F-U+009F are dropped
    (not replaced by a space); all other characters are kept unchanged.
    *s* must be a single string, not a list of strings.
    """
    return control_char_re.sub('', s)
root_dir = 'your directory'  # folder that is searched recursively for PDFs
# os.path.join supplies the separator, so root_dir works with or without a
# trailing slash. iglob is already lazy, so no generator wrapper is needed.
# NOTE: the '*.pdf' pattern is case-sensitive on case-sensitive file systems.
filenames = glob.iglob(os.path.join(root_dir, '**', '*.pdf'), recursive=True)
wb = openpyxl.load_workbook('excel.xlsx')  # your file excel
sheet = wb.active
sheet.title = 'MyPDF'
# `row` is advanced manually (instead of enumerate) so that unreadable PDFs
# are skipped without leaving empty rows in the sheet.
row = 1
for filename in filenames:
    with open(filename, 'rb') as f:
        try:
            # NOTE(review): PdfFileReader / numPages / getPage / extractText
            # are the legacy PyPDF2 names; newer releases appear to deprecate
            # them (extractText -> extract_text) - confirm against the
            # installed version.
            pdfReader = PyPDF2.PdfFileReader(f)
            count = pdfReader.numPages
            output = []  # one extracted string per page
            for i in range(count):
                print(i, filename)  # progress: page index and file
                page = pdfReader.getPage(i)
                output.append(page.extractText())
        except Exception as e:
            # Deliberately broad: a malformed PDF can make PyPDF2 raise many
            # different exception types. Skip the file and keep going.
            print(f'Error: PyPDF2 could not read {filename}. Continuing... ({e})')
            continue
    # remove_control_chars expects a single string, so clean each page on its
    # own and join afterwards; the joining newlines therefore survive.
    sheet[f'A{row}'].value = '\n'.join(remove_control_chars(text) for text in output)
    sheet[f'B{row}'].value = filename
    row += 1
wb.save('excel.xlsx')  # your file excel
print('DONE!!')
Have you tried with more than 6 or 7 files? I get this error with 7 PDFs:
TypeError Traceback (most recent call last)
<ipython-input-14-07fb0aa603b8> in <module>
23 for i in range(count):
24 page = pdfReader.getPage(i)
---> 25 output.append(page.extractText())
26 print(output)
27
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
1283 """
1284 deprecate_with_replacement("extractText", "extract_text")
-> 1285 return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
1286
1287 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, space_width)
1261 :return: a string object.
1262 """
-> 1263 return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
1264
1265 def extract_xform_text(
~\anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, space_width, content_key)
1243 text = ""
1244 else:
-> 1245 process_operation(operator, operands)
1246 output += text # just in case of
1247 return output
~\anaconda3\lib\site-packages\PyPDF2\_page.py in process_operation(operator, operands)
1195 tm_matrix[5] -= TL
1196 elif operator == b"Tj":
-> 1197 text += operands[0].translate(cmap)
1198 else:
1199 return None
TypeError: a bytes-like object is required, not 'dict'