Using Python to pull the number of pages in all the pdf documents in a directory
Question:
I am trying to use PyPDF2 to grab the number of pages of every pdf in a directory. I can use .getNumPages() to find the number of pages in one pdf file but I need to walk through a directory and get the number of pages for every file. Any ideas?
Here is the code I have so far:
import pandas as pd
import os
from PyPDF2 import PdfFileReader
# Accumulator table: one row per PDF found (file name, full path, page count).
df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
# NOTE: the reader is created once, before the walk, for a single fixed file,
# so pdf.getNumPages() below reports that same file's count on every row.
pdf=PdfFileReader(open('path/to/file.pdf','rb'))
# Visit the starting directory and every directory beneath it.
for root, dirs, files in os.walk(r'Directory path'):
for file in files:
# Only names ending in ".pdf" are recorded.
if file.endswith(".pdf"):
df2 = pd.DataFrame([[file, os.path.join(root,file),pdf.getNumPages()]], columns=['fileName', 'fileLocation', 'pageNumber'])
df = df.append(df2, ignore_index=True)
This code will just add the number of pages from the first PDF file in the directory to the dataframe. If I try to add a directory path to PdfFileReader() I get a
PermissionError:[Errno 13] Permission denied.
Answers:
Yeah, use
import glob
# Include the dot so only names with a ".pdf" extension match; "*pdf" would
# also match any name that merely ends in the letters "pdf".
# The pattern is relative, so it is resolved against the current working
# directory and does not descend into sub-directories.
list_of_pdf_filenames = glob.glob('*.pdf')
to return the list of all PDF filenames in a directory.
**Edit:**
By placing the open() statement inside the loop, I was able to get this code to run on my computer:
import pandas as pd
import os
from PyPDF2 import PdfFileReader
# Build a table with one row per PDF found beneath the starting directory:
# file name, full path, and page count.
columns = ['fileName', 'fileLocation', 'pageNumber']
rows = []
for root, dirs, files in os.walk(r'/home/benjamin/docs/'):
    for f in files:
        if not f.endswith(".pdf"):
            continue
        location = os.path.join(root, f)
        # Open each file inside the loop so every PDF gets its own reader, and
        # use a context manager so the handle is closed once the count is read.
        with open(location, 'rb') as handle:
            page_count = PdfFileReader(handle).getNumPages()
        rows.append([f, location, page_count])
# Create the DataFrame once from the collected rows: DataFrame.append copied
# the whole frame on every call and was removed in pandas 2.0.
df = pd.DataFrame(rows, columns=columns)
# head is a method; call it so the first rows are printed rather than the
# bound-method object.
print(df.head())
step 1:-
pip install pyPDF2
step 2:-
import requests, PyPDF2, io
# Placeholder: replace with the full URL of the PDF to inspect. The scheme
# (https://) is required, otherwise requests rejects the URL.
url = 'https://example.com/sample.pdf'
# A timeout keeps the script from waiting indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of handing an error page to the PDF parser.
response.raise_for_status()
# Wrap the downloaded bytes in an in-memory file object for PyPDF2 to read.
with io.BytesIO(response.content) as open_pdf_file:
    read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
    num_pages = read_pdf.getNumPages()
    print(num_pages)
If you want to use the newer pypdf
version, here is the code.
The only thing you need to install is pypdf:
pip install pypdf
Then you can run:
from pathlib import Path
from typing import Mapping
from pypdf import PdfReader
# Folder whose PDF files will be counted (placeholder value; substitute your own).
directory = Path("C://YourDirToSearch/")
def get_num_pages(pdf_file: Path) -> int:
    """Return how many pages the PDF at ``pdf_file`` contains."""
    # Hand the path to pypdf and count the entries in its page collection.
    return len(PdfReader(pdf_file).pages)
def get_pdf_page_numbers(directory: Path, *, recursive: bool = False) -> Mapping[Path, int]:
    """Map every PDF file found in ``directory`` to its page count.

    Args:
        directory: Folder to search for entries matching ``*.pdf``.
        recursive: When true, sub-directories are searched as well. The
            default (false) looks only at ``directory`` itself.

    Returns:
        A dict keyed by each PDF's path, with the value returned by
        ``get_num_pages`` for that path. Empty when nothing matches.
    """
    find = directory.rglob if recursive else directory.glob
    # Skip directories that happen to be named "*.pdf"; only regular files can
    # be opened by the reader.
    return {file: get_num_pages(file) for file in find("*.pdf") if file.is_file()}
# Print the mapping of each PDF path to its page count.
print(get_pdf_page_numbers(directory))
As a result you get something like:
{
"path1.pdf": 1,
"path2.pdf": 2,
}
I am trying to use PyPDF2 to grab the number of pages of every pdf in a directory. I can use .getNumPages() to find the number of pages in one pdf file but I need to walk through a directory and get the number of pages for every file. Any ideas?
Here is the code I have so far:
import pandas as pd
import os
from PyPDF2 import PdfFileReader
# Accumulator table: one row per PDF found (file name, full path, page count).
df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
# NOTE: the reader is created once, before the walk, for a single fixed file,
# so pdf.getNumPages() below reports that same file's count on every row.
pdf=PdfFileReader(open('path/to/file.pdf','rb'))
# Visit the starting directory and every directory beneath it.
for root, dirs, files in os.walk(r'Directory path'):
for file in files:
# Only names ending in ".pdf" are recorded.
if file.endswith(".pdf"):
df2 = pd.DataFrame([[file, os.path.join(root,file),pdf.getNumPages()]], columns=['fileName', 'fileLocation', 'pageNumber'])
df = df.append(df2, ignore_index=True)
This code will just add the number of pages from the first PDF file in the directory to the dataframe. If I try to add a directory path to PdfFileReader() I get a
PermissionError:[Errno 13] Permission denied.
Yeah, use
import glob
# Include the dot so only names with a ".pdf" extension match; "*pdf" would
# also match any name that merely ends in the letters "pdf".
# The pattern is relative, so it is resolved against the current working
# directory and does not descend into sub-directories.
list_of_pdf_filenames = glob.glob('*.pdf')
to return the list of all PDF filenames in a directory.
**Edit:**
By placing the open() statement inside the loop, I was able to get this code to run on my computer:
import pandas as pd
import os
from PyPDF2 import PdfFileReader
# Build a table with one row per PDF found beneath the starting directory:
# file name, full path, and page count.
columns = ['fileName', 'fileLocation', 'pageNumber']
rows = []
for root, dirs, files in os.walk(r'/home/benjamin/docs/'):
    for f in files:
        if not f.endswith(".pdf"):
            continue
        location = os.path.join(root, f)
        # Open each file inside the loop so every PDF gets its own reader, and
        # use a context manager so the handle is closed once the count is read.
        with open(location, 'rb') as handle:
            page_count = PdfFileReader(handle).getNumPages()
        rows.append([f, location, page_count])
# Create the DataFrame once from the collected rows: DataFrame.append copied
# the whole frame on every call and was removed in pandas 2.0.
df = pd.DataFrame(rows, columns=columns)
# head is a method; call it so the first rows are printed rather than the
# bound-method object.
print(df.head())
step 1:-
pip install pyPDF2
step 2:-
import requests, PyPDF2, io
# Placeholder: replace with the full URL of the PDF to inspect. The scheme
# (https://) is required, otherwise requests rejects the URL.
url = 'https://example.com/sample.pdf'
# A timeout keeps the script from waiting indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of handing an error page to the PDF parser.
response.raise_for_status()
# Wrap the downloaded bytes in an in-memory file object for PyPDF2 to read.
with io.BytesIO(response.content) as open_pdf_file:
    read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
    num_pages = read_pdf.getNumPages()
    print(num_pages)
If you want to use the newer pypdf
version, here is the code.
The only thing you need to install is pypdf:
pip install pypdf
Then you can run:
from pathlib import Path
from typing import Mapping
from pypdf import PdfReader
# Folder whose PDF files will be counted (placeholder value; substitute your own).
directory = Path("C://YourDirToSearch/")
def get_num_pages(pdf_file: Path) -> int:
    """Return how many pages the PDF at ``pdf_file`` contains."""
    # Hand the path to pypdf and count the entries in its page collection.
    return len(PdfReader(pdf_file).pages)
def get_pdf_page_numbers(directory: Path, *, recursive: bool = False) -> Mapping[Path, int]:
    """Map every PDF file found in ``directory`` to its page count.

    Args:
        directory: Folder to search for entries matching ``*.pdf``.
        recursive: When true, sub-directories are searched as well. The
            default (false) looks only at ``directory`` itself.

    Returns:
        A dict keyed by each PDF's path, with the value returned by
        ``get_num_pages`` for that path. Empty when nothing matches.
    """
    find = directory.rglob if recursive else directory.glob
    # Skip directories that happen to be named "*.pdf"; only regular files can
    # be opened by the reader.
    return {file: get_num_pages(file) for file in find("*.pdf") if file.is_file()}
# Print the mapping of each PDF path to its page count.
print(get_pdf_page_numbers(directory))
As a result you get something like:
{
"path1.pdf": 1,
"path2.pdf": 2,
}