Extract specific pages of PDF and save it with Python

Question

I have some sources and tried to code which extract some pages and create pdf files.
I have a list which looks like this

information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].

This is my code.

from PyPDF2 import PdfFileReader, PdfFileWriter

reader = PdfFileReader("example.pdf")

for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()
    start = information[page][1]
    end = information[page][2]
    while start < end:
        writer.addPage(reader.getPage(start))
        start += 1
        output_filename = "{}_{}_page_{}.pdf".format(
            information[page][0], information[page][1], information[page][2]
        )
    with open(output_filename, "wb") as out:
        writer.write(out)

But the output is weird.. some has nothing inside and some has just one page in it. How can I correct this?

Asked By: SSS

||

Source

Answer 1

I have fixed the issue. it was the equal sign (start<=end).

for page in range(len(information)):
    pdf_writer = PyPDF2.PdfFileWriter()
    start = information[page][1]
    end = information[page][2]
    while start<=end:
        pdf_writer.addPage(pdfReader.getPage(start-1))
        start+=1
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    output_filename = '{}_{}_page_{}.pdf'.format(information[page][0],information[page][1], information[page][2])
    with open(output_filename,'wb') as out:
        pdf_writer.write(out)

Answered By: SSS

Answer 2

Full code and I modified SSS’ answer to be portable, flexible, and concurrent with multiple source pdfs.
I couldn’t test the performance difference between ThreadPoolExecutor and ProcessPoolExecutor, but I assumed that the extraction process is bounded by the reading and writing of PDFs rather than by getPage and addPage.

import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter


def pdf_extract(pdf, segments):
    """
    pdf: str | Path
    segments: [(start, end), {'start': int, 'end': int}]
    """
    with open(pdf, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            pdf_writer = PdfFileWriter()
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                start_page, end_page = segment
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))
            p = Path(pdf)
            ouput = p.parent / p.with_stem(f'{p.stem}_pages_{start_page}-{end_page}')
            with open(ouput, 'wb') as out:
                pdf_writer.write(out)


def __pdf_extract(pair):
    return pdf_extract(*pair)


def pdf_extract_batch(pdfs, workers=20):
    """
    pdfs = {pdf_name: [(1, 1), ...], ...}
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        executor.map(__pdf_extract, pdfs.items())


if __name__ == '__main__':
    freeze_support()
    pdf_name = r'C:UsersmasteDocumentslong.pdf'
    segments = [(1, 1), {'start': 3, 'end': 5}]
    # Single
    pdf_extract(pdf_name, segments)
    # Batched (Concurrent)
    pdfs = {pdf_name: segments}
    # pdf_extract_batch(pdfs)

Answered By: Elijah

Answer 3

The older answers are good, but there have been some changes after PyPDF version 3.0.0. Here’s an updated alternative (basically the reader and writer classes were renamed, and getPage() is now pages[]). If you only need one fragment simply remove the first loop.

from PyPDF2 import PdfReader, PdfWriter


pdf_reader = PdfReader(source_pdf_file_path)
pages = [(1, 3), (2, 6)]
for page_indices in pages:
    pdf_writer = PdfWriter()  # we want to reset this when starting a new pdf
    for idx in range(page_indices[0] - 1, page_indices[1]):
        pdf_writer.add_page(pdf_reader.pages[idx])
    output_filename = f"{out_folder}/{source_pdf_file_path.stem}_{page_indices[0]}.pdf"
    with open(output_filename, "wb") as out:
        pdf_writer.write(out)

Answered By: Pablo

Answer 4

my quick solution :

    from PyPDF2 import PdfReader, PdfWriter
    pdf_file_path = 'file.pdf'
    file_base_name = pdf_file_path.replace('.pdf', '')

    pdf = PdfReader(pdf_file_path)
    print('ca passe avant le decoupage')
    pages = [0, 2, 4] # page 1, 3, 5
    pdfWriter = PdfWriter()

    for page_num in pages:
        pdfWriter.add_page(pdf.pages[page_num])

    with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
        pdfWriter.write(f)
        f.close()

Answered By: Ay.AZ

Extract specific pages of PDF and save it with Python

Question:

Answers: