How to get bookmark's page number
Question:
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination
def get_outlines(pdf_filepath: str) -> List[Destination]:
    """Return the outline (bookmark) entries stored in the PDF at *pdf_filepath*."""
    with open(pdf_filepath, "rb") as pdf_stream:
        # The outline is read while the stream is still open.
        return PdfFileReader(pdf_stream).getOutlines()


print(get_outlines("PDF-export-example.pdf"))
pyPdf.pdf.Destination
has many properties, but I can’t find any referring page number of that bookmark. How can I get the page number of the bookmarks?
For example outlines[1].page.idnum
returns a number which is approximately 3 times bigger than the referenced page number in the PDF document. I assume it references some object smaller than a page, as running .page.idnum
on the whole PDF document outline returns an array of numbers which is not even linearly correlated with the "real" page-number destinations in the PDF document; it is only roughly a multiple of ~3.
Update: This question is same as this: split a pdf based on outline although I don’t understand what author did in his self answer there. Seems too complicated to me to be usable
Answers:
I’m not sure but according to the docs for pypdf.Destination the page number for the bookmark is just Destination.page .
As @theta pointed out “split a pdf based on outline” has the code required to extract page numbers. If you feel this is complicated I copied part of the code which maps page ids to page numbers and made it a function. Here is a working example that prints page number of bookmark o[0]:
from PyPDF2 import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every page-tree node to a 0-based page index.

    Walks the ``/Pages`` tree depth-first, starting at the document root
    (``pdf.trailer["/Root"]``) or at ``pages`` when one is given.  Each kid's
    ``idnum`` is mapped to the number of leaf ``/Page`` nodes visited before
    it, so a leaf gets its own index and an intermediate ``/Pages`` node gets
    the index of its first page.

    Args:
        pdf: Reader object exposing ``trailer``; only read when ``pages`` is None.
        pages: Page-tree node to start from; defaults to the document root.
        _result: Accumulator dict shared across the recursion.
        _num_pages: Accumulator list holding one entry per leaf page seen so far.

    Returns:
        The dict mapping ``idnum`` to 0-based page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Starting from the document root: restart the page counter.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A subtree was supplied without a counter; avoid len(None) below.
        _num_pages = []
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        # The list length is the running page count shared by all recursion levels.
        _num_pages.append(1)
    return _result
# main
# Open the PDF in a context manager so the file handle is closed once the
# outline and page map have been read.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # The map is 0-based; add 1 for the page number a viewer would show.
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
print(pg_num)
probably too late for @theta but might help others 🙂 btw my first post on stackoverflow so excuse me if I did not follow the usual format
To extend this further:
If you are looking to get the exact location on the page for a bookmark this will make your job easier:
from PyPDF2 import PdfFileReader
import PyPDF2 as pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every page-tree node to a 0-based page index.

    Walks the ``/Pages`` tree depth-first, starting at the document root
    (``pdf.trailer["/Root"]``) or at ``pages`` when one is given.  Each kid's
    ``idnum`` is mapped to the number of leaf ``/Page`` nodes visited before
    it, so a leaf gets its own index and an intermediate ``/Pages`` node gets
    the index of its first page.

    Args:
        pdf: Reader object exposing ``trailer``; only read when ``pages`` is None.
        pages: Page-tree node to start from; defaults to the document root.
        _result: Accumulator dict shared across the recursion.
        _num_pages: Accumulator list holding one entry per leaf page seen so far.

    Returns:
        The dict mapping ``idnum`` to 0-based page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Starting from the document root: restart the page counter.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A subtree was supplied without a counter; avoid len(None) below.
        _num_pages = []
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        # The list length is the running page count shared by all recursion levels.
        _num_pages.append(1)
    return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, on-page position and page number for every bookmark.

    Args:
        outlines: A Destination, or an arbitrarily nested list of them.
        pg_id_num_map: dict mapping page object ids to 0-based page indexes.
        result: Accumulator dict; created on the outermost call.

    Returns:
        dict keyed by the first whitespace-separated token of each title
        (the whole title when it has no tokens).  Each value is a dict with
        ``title``, ``top``, ``left`` and 1-based ``page``; ``top``/``left``
        are None when the destination carries no such entry.
    """
    if result is None:
        result = {}
    if isinstance(outlines, list):
        for outline in outlines:
            outlines_pg_zoom_info(outline, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        tokens = title.split()
        # An empty or whitespace-only title has no first token; key on the title itself.
        key = tokens[0] if tokens else title
        result[key] = dict(
            title=title,
            # Not every destination type stores coordinates, so look before indexing.
            top=outlines['/Top'] if '/Top' in outlines else None,
            left=outlines['/Left'] if '/Left' in outlines else None,
            page=pg_id_num_map[outlines.page.idnum] + 1,
        )
    return result
# main
pdf_name = 'document.pdf'
# Open the PDF in a context manager so the file handle is closed once all
# bookmark data has been extracted.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
print(bookmarks_info)
Note: My bookmarks are section numbers (ex: 1.1 Introduction) and I am mapping the bookmark info to the section number. If your bookmarks are different modify this part of the code:
elif type(outlines) == pyPdf.pdf.Destination:
title = outlines['/Title']
result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'],
left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
In 2019, for ones who are interested in a faster way, it’s possible to use:
from PyPDF2 import PdfFileReader
def printPageNumberFrom(filename):
    """Print the 1-based page number of every bookmark in the PDF *filename*.

    Child bookmarks, which appear as nested lists inside the outline, are
    visited too, in document order.
    """
    def _print_pages(bookmarks):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                _print_pages(b)
            else:
                print(pdf.getDestinationPageNumber(b) + 1)  # page count starts from 0

    with open(filename, "rb") as f:
        pdf = PdfFileReader(f)
        _print_pages(pdf.getOutlines())
Manage bookmarks recursively with vjayky and Giulio D suggestion
PyPDF2 >= v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print each bookmark title with its 1-based page number.

    Nested bookmarks are indented by four spaces per level.

    Args:
        pdf: Reader object providing ``getOutlines()`` and
            ``getDestinationPageNumber()``.
    """
    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pdf.getDestinationPageNumber(b) + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# Demo run: list every bookmark of the sample document with its page.
with open('document.pdf', mode="rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
PyPDF2 < v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print each bookmark title with its 1-based page number.

    Builds the page-id lookup from ``getPage(i).indirectRef.idnum`` and
    resolves every bookmark through it.  Nested bookmarks are indented by
    four spaces per level.

    Args:
        pdf: Reader object providing ``getNumPages()``, ``getPage()`` and
            ``getOutlines()``.
    """
    # Map page ids to page numbers
    pg_id_to_num = {
        pdf.getPage(pg_num).indirectRef.idnum: pg_num
        for pg_num in range(pdf.getNumPages())
    }

    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pg_id_to_num[b.page.idnum] + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# Demo run: list every bookmark of the sample document with its page.
with open('document.pdf', mode="rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination
def get_outlines(pdf_filepath: str) -> List[Destination]:
    """Return the outline (bookmark) entries stored in the PDF at *pdf_filepath*."""
    with open(pdf_filepath, "rb") as pdf_stream:
        # The outline is read while the stream is still open.
        return PdfFileReader(pdf_stream).getOutlines()


print(get_outlines("PDF-export-example.pdf"))
pyPdf.pdf.Destination
has many properties, but I can’t find any referring page number of that bookmark. How can I get the page number of the bookmarks?
For example outlines[1].page.idnum
returns a number which is approximately 3 times bigger than the referenced page number in the PDF document. I assume it references some object smaller than a page, as running .page.idnum
on the whole PDF document outline returns an array of numbers which is not even linearly correlated with the "real" page-number destinations in the PDF document; it is only roughly a multiple of ~3.
Update: This question is same as this: split a pdf based on outline although I don’t understand what author did in his self answer there. Seems too complicated to me to be usable
I’m not sure but according to the docs for pypdf.Destination the page number for the bookmark is just Destination.page .
As @theta pointed out “split a pdf based on outline” has the code required to extract page numbers. If you feel this is complicated I copied part of the code which maps page ids to page numbers and made it a function. Here is a working example that prints page number of bookmark o[0]:
from PyPDF2 import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every page-tree node to a 0-based page index.

    Walks the ``/Pages`` tree depth-first, starting at the document root
    (``pdf.trailer["/Root"]``) or at ``pages`` when one is given.  Each kid's
    ``idnum`` is mapped to the number of leaf ``/Page`` nodes visited before
    it, so a leaf gets its own index and an intermediate ``/Pages`` node gets
    the index of its first page.

    Args:
        pdf: Reader object exposing ``trailer``; only read when ``pages`` is None.
        pages: Page-tree node to start from; defaults to the document root.
        _result: Accumulator dict shared across the recursion.
        _num_pages: Accumulator list holding one entry per leaf page seen so far.

    Returns:
        The dict mapping ``idnum`` to 0-based page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Starting from the document root: restart the page counter.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A subtree was supplied without a counter; avoid len(None) below.
        _num_pages = []
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        # The list length is the running page count shared by all recursion levels.
        _num_pages.append(1)
    return _result
# main
# Open the PDF in a context manager so the file handle is closed once the
# outline and page map have been read.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # The map is 0-based; add 1 for the page number a viewer would show.
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
print(pg_num)
probably too late for @theta but might help others 🙂 btw my first post on stackoverflow so excuse me if I did not follow the usual format
To extend this further:
If you are looking to get the exact location on the page for a bookmark this will make your job easier:
from PyPDF2 import PdfFileReader
import PyPDF2 as pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every page-tree node to a 0-based page index.

    Walks the ``/Pages`` tree depth-first, starting at the document root
    (``pdf.trailer["/Root"]``) or at ``pages`` when one is given.  Each kid's
    ``idnum`` is mapped to the number of leaf ``/Page`` nodes visited before
    it, so a leaf gets its own index and an intermediate ``/Pages`` node gets
    the index of its first page.

    Args:
        pdf: Reader object exposing ``trailer``; only read when ``pages`` is None.
        pages: Page-tree node to start from; defaults to the document root.
        _result: Accumulator dict shared across the recursion.
        _num_pages: Accumulator list holding one entry per leaf page seen so far.

    Returns:
        The dict mapping ``idnum`` to 0-based page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Starting from the document root: restart the page counter.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A subtree was supplied without a counter; avoid len(None) below.
        _num_pages = []
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        # The list length is the running page count shared by all recursion levels.
        _num_pages.append(1)
    return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, on-page position and page number for every bookmark.

    Args:
        outlines: A Destination, or an arbitrarily nested list of them.
        pg_id_num_map: dict mapping page object ids to 0-based page indexes.
        result: Accumulator dict; created on the outermost call.

    Returns:
        dict keyed by the first whitespace-separated token of each title
        (the whole title when it has no tokens).  Each value is a dict with
        ``title``, ``top``, ``left`` and 1-based ``page``; ``top``/``left``
        are None when the destination carries no such entry.
    """
    if result is None:
        result = {}
    if isinstance(outlines, list):
        for outline in outlines:
            outlines_pg_zoom_info(outline, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        tokens = title.split()
        # An empty or whitespace-only title has no first token; key on the title itself.
        key = tokens[0] if tokens else title
        result[key] = dict(
            title=title,
            # Not every destination type stores coordinates, so look before indexing.
            top=outlines['/Top'] if '/Top' in outlines else None,
            left=outlines['/Left'] if '/Left' in outlines else None,
            page=pg_id_num_map[outlines.page.idnum] + 1,
        )
    return result
# main
pdf_name = 'document.pdf'
# Open the PDF in a context manager so the file handle is closed once all
# bookmark data has been extracted.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
print(bookmarks_info)
Note: My bookmarks are section numbers (ex: 1.1 Introduction) and I am mapping the bookmark info to the section number. If your bookmarks are different modify this part of the code:
elif type(outlines) == pyPdf.pdf.Destination:
title = outlines['/Title']
result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'],
left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
In 2019, for ones who are interested in a faster way, it’s possible to use:
from PyPDF2 import PdfFileReader
def printPageNumberFrom(filename):
    """Print the 1-based page number of every bookmark in the PDF *filename*.

    Child bookmarks, which appear as nested lists inside the outline, are
    visited too, in document order.
    """
    def _print_pages(bookmarks):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                _print_pages(b)
            else:
                print(pdf.getDestinationPageNumber(b) + 1)  # page count starts from 0

    with open(filename, "rb") as f:
        pdf = PdfFileReader(f)
        _print_pages(pdf.getOutlines())
Manage bookmarks recursively with vjayky and Giulio D suggestion
PyPDF2 >= v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print each bookmark title with its 1-based page number.

    Nested bookmarks are indented by four spaces per level.

    Args:
        pdf: Reader object providing ``getOutlines()`` and
            ``getDestinationPageNumber()``.
    """
    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pdf.getDestinationPageNumber(b) + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# Demo run: list every bookmark of the sample document with its page.
with open('document.pdf', mode="rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
PyPDF2 < v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print each bookmark title with its 1-based page number.

    Builds the page-id lookup from ``getPage(i).indirectRef.idnum`` and
    resolves every bookmark through it.  Nested bookmarks are indented by
    four spaces per level.

    Args:
        pdf: Reader object providing ``getNumPages()``, ``getPage()`` and
            ``getOutlines()``.
    """
    # Map page ids to page numbers
    pg_id_to_num = {
        pdf.getPage(pg_num).indirectRef.idnum: pg_num
        for pg_num in range(pdf.getNumPages())
    }

    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list holds the children of the preceding bookmark.
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pg_id_to_num[b.page.idnum] + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# Demo run: list every bookmark of the sample document with its page.
with open('document.pdf', mode="rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)