Extract specific pages from a PDF and save them with Python
Question:
I have some source PDFs and am trying to write code that extracts certain pages and saves them as new PDF files.
I have a list which looks like this
information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].
This is my code.
from PyPDF2 import PdfFileReader, PdfFileWriter
# Question code, shown as posted: split "example.pdf" into one output file per
# entry of `information` (a list of (filename, startpage, endpage) tuples).
reader = PdfFileReader("example.pdf")
# NOTE(review): the loop runs getNumPages() - 1 times, yet `page` is used
# below as an index into `information`, not as a page number — confirm.
for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()  # a new writer for each output file
    start = information[page][1]
    end = information[page][2]
    # `start < end` stops before `end`, so the end page itself is never added
    # and an entry with start == end adds no page at all.
    while start < end:
        # NOTE(review): `start` is passed to getPage() unchanged; presumably
        # getPage() counts from 0 while `information` counts from 1 — verify.
        writer.addPage(reader.getPage(start))
        start += 1
    # Output name: <filename>_<startpage>_page_<endpage>.pdf
    output_filename = "{}_{}_page_{}.pdf".format(
        information[page][0], information[page][1], information[page][2]
    )
    with open(output_filename, "wb") as out:
        writer.write(out)
But the output is weird: some files have nothing inside and some have just one page. How can I correct this?
Answers:
I have fixed the issue. The problem was a missing equals sign: the loop condition has to be start <= end so that the end page is included.
# Write one PDF per (filename, first_page, last_page) entry of `information`.

# Create the destination folder once, up front. exist_ok=True replaces the
# separate os.path.exists() check and the gap between checking and creating.
# NOTE(review): `savepath` is created, but output_filename below is not joined
# with it, so files land wherever the names in `information` point — confirm
# whether the outputs are meant to go inside `savepath`.
os.makedirs(savepath, exist_ok=True)

for filename, first_page, last_page in information:
    pdf_writer = PyPDF2.PdfFileWriter()  # a new writer for each output file

    # first_page..last_page is inclusive and 1-based, while getPage() is
    # called with index - 1, hence range(first_page - 1, last_page).
    for page_index in range(first_page - 1, last_page):
        pdf_writer.addPage(pdfReader.getPage(page_index))

    # Output name: <filename>_<first_page>_page_<last_page>.pdf
    output_filename = '{}_{}_page_{}.pdf'.format(filename, first_page, last_page)
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
Here is the full code. I modified SSS’ answer to be portable, flexible, and concurrent across multiple source PDFs.
I couldn’t test the performance difference between ThreadPoolExecutor and ProcessPoolExecutor, but I assumed that the extraction process is bounded by the reading and writing of PDFs rather than by getPage and addPage.
import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_extract(pdf, segments):
    """
    Split page ranges out of one PDF, writing one new PDF per range.

    Each output file is written next to the source file and is named
    ``<stem>_pages_<start>-<end><suffix>``.

    pdf: str | Path
        Path of the source PDF.
    segments: [(start, end), {'start': int, 'end': int}]
        Page ranges to extract, 1-based and inclusive. Each item is either
        a (start, end) pair or a mapping with 'start' and 'end' keys.
    """
    source = Path(pdf)
    # The source stream stays open for the whole loop because every segment
    # is read from the same reader.
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Indexing a tuple/list with a string raises TypeError.
                start_page, end_page = segment

            pdf_writer = PdfFileWriter()  # a new writer for each output file
            # Page numbers are 1-based and inclusive; getPage() gets index - 1.
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

            # with_name() keeps the parent directory, so it must not be
            # prefixed with source.parent again: for a relative path such as
            # 'docs/a.pdf' that would yield 'docs/docs/...'.
            output = source.with_name(
                f'{source.stem}_pages_{start_page}-{end_page}{source.suffix}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)
def __pdf_extract(pair):
    """Unpack one ``(pdf, segments)`` pair into a ``pdf_extract`` call.

    Lets a single-argument mapper (such as ``executor.map``) drive
    ``pdf_extract``, which takes its arguments separately.
    """
    outcome = pdf_extract(*pair)
    return outcome
def pdf_extract_batch(pdfs, workers=20):
    """
    Run ``pdf_extract`` for several source PDFs on a thread pool.

    pdfs = {pdf_name: [(1, 1), ...], ...}
        Maps each source PDF path to the list of segments to extract from it.
    workers: int
        Maximum number of worker threads.

    Raises whatever exception a worker raised (the first one encountered in
    input order); without consuming the results such failures go unnoticed.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map() only re-raises a worker's exception when its result
        # is retrieved, so drain the iterator to surface failures.
        list(executor.map(__pdf_extract, pdfs.items()))
if __name__ == '__main__':
    # NOTE(review): freeze_support() comes from multiprocessing, while the
    # batch helper uses a ThreadPoolExecutor — presumably kept in case the
    # executor is switched to ProcessPoolExecutor; confirm.
    freeze_support()

    # Example input. The backslashes of this Windows path are restored here;
    # without them the literal does not name the intended file.
    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    # Both supported segment shapes: a (start, end) pair and a dict.
    segments = [(1, 1), {'start': 3, 'end': 5}]

    # Single
    pdf_extract(pdf_name, segments)

    # Batched (Concurrent)
    pdfs = {pdf_name: segments}
    # pdf_extract_batch(pdfs)
The older answers are good, but there have been some changes since PyPDF2 version 3.0.0. Here’s an updated alternative (basically, the reader and writer classes were renamed, and getPage() is now pages[]). If you only need one fragment, simply remove the outer loop.
from PyPDF2 import PdfReader, PdfWriter
# Split the source PDF into one output file per page range.
pdf_reader = PdfReader(source_pdf_file_path)

# Each entry is a (first, last) pair. The loop below starts at first - 1 and
# stops before last, so the pairs are treated as 1-based and inclusive.
pages = [(1, 3), (2, 6)]

for page_indices in pages:
    first_page, last_page = page_indices
    writer = PdfWriter()  # a new writer per range, so every output starts empty

    page_index = first_page - 1
    while page_index < last_page:
        writer.add_page(pdf_reader.pages[page_index])
        page_index += 1

    # The name carries only the first page of the range.
    target_path = f"{out_folder}/{source_pdf_file_path.stem}_{page_indices[0]}.pdf"
    with open(target_path, "wb") as out_file:
        writer.write(out_file)
My quick solution:
from PyPDF2 import PdfReader, PdfWriter
# Copy a hand-picked set of pages from one PDF into '<base>_subset.pdf'.
pdf_file_path = 'file.pdf'
# Strip only a trailing '.pdf'; str.replace() would also remove any '.pdf'
# that occurs earlier in the path.
file_base_name = (
    pdf_file_path[:-len('.pdf')]
    if pdf_file_path.endswith('.pdf')
    else pdf_file_path
)

pdf = PdfReader(pdf_file_path)
# Progress message (French, roughly "got as far as just before the split").
print('ca passe avant le decoupage')

pages = [0, 2, 4]  # page 1, 3, 5
pdfWriter = PdfWriter()
for page_num in pages:
    pdfWriter.add_page(pdf.pages[page_num])

# The with-block closes the file on exit, so no explicit close() is needed.
with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
    pdfWriter.write(f)
I have some source PDFs and am trying to write code that extracts certain pages and saves them as new PDF files.
I have a list which looks like this
information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].
This is my code.
from PyPDF2 import PdfFileReader, PdfFileWriter
# Question code, shown as posted: split "example.pdf" into one output file per
# entry of `information` (a list of (filename, startpage, endpage) tuples).
reader = PdfFileReader("example.pdf")
# NOTE(review): the loop runs getNumPages() - 1 times, yet `page` is used
# below as an index into `information`, not as a page number — confirm.
for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()  # a new writer for each output file
    start = information[page][1]
    end = information[page][2]
    # `start < end` stops before `end`, so the end page itself is never added
    # and an entry with start == end adds no page at all.
    while start < end:
        # NOTE(review): `start` is passed to getPage() unchanged; presumably
        # getPage() counts from 0 while `information` counts from 1 — verify.
        writer.addPage(reader.getPage(start))
        start += 1
    # Output name: <filename>_<startpage>_page_<endpage>.pdf
    output_filename = "{}_{}_page_{}.pdf".format(
        information[page][0], information[page][1], information[page][2]
    )
    with open(output_filename, "wb") as out:
        writer.write(out)
But the output is weird: some files have nothing inside and some have just one page. How can I correct this?
I have fixed the issue. The problem was a missing equals sign: the loop condition has to be start <= end so that the end page is included.
# Write one PDF per (filename, first_page, last_page) entry of `information`.

# Create the destination folder once, up front. exist_ok=True replaces the
# separate os.path.exists() check and the gap between checking and creating.
# NOTE(review): `savepath` is created, but output_filename below is not joined
# with it, so files land wherever the names in `information` point — confirm
# whether the outputs are meant to go inside `savepath`.
os.makedirs(savepath, exist_ok=True)

for filename, first_page, last_page in information:
    pdf_writer = PyPDF2.PdfFileWriter()  # a new writer for each output file

    # first_page..last_page is inclusive and 1-based, while getPage() is
    # called with index - 1, hence range(first_page - 1, last_page).
    for page_index in range(first_page - 1, last_page):
        pdf_writer.addPage(pdfReader.getPage(page_index))

    # Output name: <filename>_<first_page>_page_<last_page>.pdf
    output_filename = '{}_{}_page_{}.pdf'.format(filename, first_page, last_page)
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
Here is the full code. I modified SSS’ answer to be portable, flexible, and concurrent across multiple source PDFs.
I couldn’t test the performance difference between ThreadPoolExecutor and ProcessPoolExecutor, but I assumed that the extraction process is bounded by the reading and writing of PDFs rather than by getPage and addPage.
import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_extract(pdf, segments):
    """
    Split page ranges out of one PDF, writing one new PDF per range.

    Each output file is written next to the source file and is named
    ``<stem>_pages_<start>-<end><suffix>``.

    pdf: str | Path
        Path of the source PDF.
    segments: [(start, end), {'start': int, 'end': int}]
        Page ranges to extract, 1-based and inclusive. Each item is either
        a (start, end) pair or a mapping with 'start' and 'end' keys.
    """
    source = Path(pdf)
    # The source stream stays open for the whole loop because every segment
    # is read from the same reader.
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Indexing a tuple/list with a string raises TypeError.
                start_page, end_page = segment

            pdf_writer = PdfFileWriter()  # a new writer for each output file
            # Page numbers are 1-based and inclusive; getPage() gets index - 1.
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

            # with_name() keeps the parent directory, so it must not be
            # prefixed with source.parent again: for a relative path such as
            # 'docs/a.pdf' that would yield 'docs/docs/...'.
            output = source.with_name(
                f'{source.stem}_pages_{start_page}-{end_page}{source.suffix}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)
def __pdf_extract(pair):
    """Unpack one ``(pdf, segments)`` pair into a ``pdf_extract`` call.

    Lets a single-argument mapper (such as ``executor.map``) drive
    ``pdf_extract``, which takes its arguments separately.
    """
    outcome = pdf_extract(*pair)
    return outcome
def pdf_extract_batch(pdfs, workers=20):
    """
    Run ``pdf_extract`` for several source PDFs on a thread pool.

    pdfs = {pdf_name: [(1, 1), ...], ...}
        Maps each source PDF path to the list of segments to extract from it.
    workers: int
        Maximum number of worker threads.

    Raises whatever exception a worker raised (the first one encountered in
    input order); without consuming the results such failures go unnoticed.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map() only re-raises a worker's exception when its result
        # is retrieved, so drain the iterator to surface failures.
        list(executor.map(__pdf_extract, pdfs.items()))
if __name__ == '__main__':
    # NOTE(review): freeze_support() comes from multiprocessing, while the
    # batch helper uses a ThreadPoolExecutor — presumably kept in case the
    # executor is switched to ProcessPoolExecutor; confirm.
    freeze_support()

    # Example input. The backslashes of this Windows path are restored here;
    # without them the literal does not name the intended file.
    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    # Both supported segment shapes: a (start, end) pair and a dict.
    segments = [(1, 1), {'start': 3, 'end': 5}]

    # Single
    pdf_extract(pdf_name, segments)

    # Batched (Concurrent)
    pdfs = {pdf_name: segments}
    # pdf_extract_batch(pdfs)
The older answers are good, but there have been some changes since PyPDF2 version 3.0.0. Here’s an updated alternative (basically, the reader and writer classes were renamed, and getPage() is now pages[]). If you only need one fragment, simply remove the outer loop.
from PyPDF2 import PdfReader, PdfWriter
# Split the source PDF into one output file per page range.
pdf_reader = PdfReader(source_pdf_file_path)

# Each entry is a (first, last) pair. The loop below starts at first - 1 and
# stops before last, so the pairs are treated as 1-based and inclusive.
pages = [(1, 3), (2, 6)]

for page_indices in pages:
    first_page, last_page = page_indices
    writer = PdfWriter()  # a new writer per range, so every output starts empty

    page_index = first_page - 1
    while page_index < last_page:
        writer.add_page(pdf_reader.pages[page_index])
        page_index += 1

    # The name carries only the first page of the range.
    target_path = f"{out_folder}/{source_pdf_file_path.stem}_{page_indices[0]}.pdf"
    with open(target_path, "wb") as out_file:
        writer.write(out_file)
My quick solution:
from PyPDF2 import PdfReader, PdfWriter
# Copy a hand-picked set of pages from one PDF into '<base>_subset.pdf'.
pdf_file_path = 'file.pdf'
# Strip only a trailing '.pdf'; str.replace() would also remove any '.pdf'
# that occurs earlier in the path.
file_base_name = (
    pdf_file_path[:-len('.pdf')]
    if pdf_file_path.endswith('.pdf')
    else pdf_file_path
)

pdf = PdfReader(pdf_file_path)
# Progress message (French, roughly "got as far as just before the split").
print('ca passe avant le decoupage')

pages = [0, 2, 4]  # page 1, 3, 5
pdfWriter = PdfWriter()
for page_num in pages:
    pdfWriter.add_page(pdf.pages[page_num])

# The with-block closes the file on exit, so no explicit close() is needed.
with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
    pdfWriter.write(f)