Extract text per page with Python pdfMiner?
Question:
I have experimented with both pypdf and pdfMiner to extract text from PDF files. I have some unfriendly PDFs that only pdfMiner is able to extract successfully. I am using the code here to extract text for the entire file. However, I would really like to extract text on a per page basis like the pages[i].extract_text()
functionality in pypdf. Does anyone know how to extract text per page using pdfMiner?
Answers:
for pageNumber, page in enumerate(PDFDocument.get_pages()):
if pageNumber == 42:
#do something with the page
There is a pretty good article here.
This is how you write all the pages to separate files:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
import io
import os
fp = open('Files/Company_list/0010/pdf_files/testfile3.pdf', 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
print(type(retstr))
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
page_no = 0
for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
if pageNumber == page_no:
interpreter.process_page(page)
data = retstr.getvalue()
with open(os.path.join('Files/Company_list/0010/text_parsed/2017AR', f'pdf page {page_no}.txt'), 'wb') as file:
file.write(data.encode('utf-8'))
data = ''
retstr.truncate(0)
retstr.seek(0)
page_no += 1
Just replace page_no with page number you want if you want specific page numbers.
I have experimented with both pypdf and pdfMiner to extract text from PDF files. I have some unfriendly PDFs that only pdfMiner is able to extract successfully. I am using the code here to extract text for the entire file. However, I would really like to extract text on a per page basis like the pages[i].extract_text()
functionality in pypdf. Does anyone know how to extract text per page using pdfMiner?
for pageNumber, page in enumerate(PDFDocument.get_pages()):
if pageNumber == 42:
#do something with the page
There is a pretty good article here.
This is how you write all the pages to separate files:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
import io
import os
fp = open('Files/Company_list/0010/pdf_files/testfile3.pdf', 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
print(type(retstr))
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
page_no = 0
for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
if pageNumber == page_no:
interpreter.process_page(page)
data = retstr.getvalue()
with open(os.path.join('Files/Company_list/0010/text_parsed/2017AR', f'pdf page {page_no}.txt'), 'wb') as file:
file.write(data.encode('utf-8'))
data = ''
retstr.truncate(0)
retstr.seek(0)
page_no += 1
Just replace page_no with page number you want if you want specific page numbers.