Add text to Existing PDF using Python
Question:
I need to add some extra text to an existing PDF using Python, what is the best way to go about this and what extra modules will I need to install.
Note: Ideally I would like to be able to run this on both Windows and Linux, but at a push Linux only will do.
Edit: pypdf and ReportLab look good but neither one will allow me to edit an existing PDF, are there any other options?
Answers:
You may have better luck breaking the problem down into converting PDF into an editable format, writing your changes, then converting it back into PDF. I don’t know of a library that lets you directly edit PDF but there are plenty of converters between DOC and PDF for example.
If you’re on Windows, this might work:
There’s also a whitepaper of a PDF creation and editing framework in Python. It’s a little dated, but maybe can give you some useful info:
I know this is an older post, but I spent a long time trying to find a solution. I came across a decent one using only ReportLab and PyPDF so I thought I’d share:
- read your PDF using
PdfFileReader()
, we’ll call this input
- create a new pdf containing your text to add using ReportLab, save this as a string object
- read the string object using
PdfFileReader()
, we’ll call this text
- create a new PDF object using
PdfFileWriter()
, we’ll call this output
- iterate through input and apply
.mergePage(*text*.getPage(0))
for each page you want the text added to, then use output.addPage()
to add the modified pages to a new document
This works well for simple text additions. See PyPDF’s sample for watermarking a document.
Here is some code to answer the question below:
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
<do something with canvas>
can.save()
packet.seek(0)
input = PdfFileReader(packet)
From here you can merge the pages of the input file with another document.
Example for [Python 2.7]:
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(file("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = file("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Example for Python 3.x:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(open("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.pages[0]
page.merge_page(new_pdf.pages[0])
output.add_page(page)
# finally, write "output" to a real file
output_stream = open("destination.pdf", "wb")
output.write(output_stream)
output_stream.close()
cpdf will do the job from the command-line. It isn’t python, though (afaik):
cpdf -add-text "Line of text" input.pdf -o output .pdf
pdfrw will let you read in pages from an existing PDF and draw them to a reportlab canvas (similar to drawing an image). There are examples for this in the pdfrw examples/rl1 subdirectory on github. Disclaimer: I am the pdfrw author.
Leveraging David Dehghan‘s answer above, the following works in Python 2.7.13:
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
# create a new PDF with Reportlab
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(290, 720, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader("original.pdf")
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Don’t use mergePage, It may not work for some pdfs
You should use mergeRotatedTranslatedPage
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
The PyPDF2 as of date of writing has depreciated the PdfFileReader, PdfFileWriter and few other methods and changed it to different names and methods and has also changed methods like getPage() directly to attribute of PdfReader.
Here is a very Simple Class to add text to existing pdf file:
(Use is demonstrated at end)
from PyPDF2 import PdfWriter, PdfReader, Transformation
import io
from reportlab.pdfgen.canvas import Canvas
class GenerateFromTemplate:
def __init__(self,template):
self.template_pdf = PdfReader(open(template, "rb"))
self.template_page= self.template_pdf.pages[0]
self.packet = io.BytesIO()
self.c = Canvas(self.packet,pagesize=(self.template_page.mediabox.width,self.template_page.mediabox.height))
def addText(self,text,point):
self.c.drawString(point[0],point[1],text)
def merge(self):
self.c.save()
self.packet.seek(0)
result_pdf = PdfReader(self.packet)
result = result_pdf.pages[0]
self.output = PdfWriter()
op = Transformation().rotate(0).translate(tx=0, ty=0)
result.add_transformation(op)
self.template_page.merge_page(result)
self.output.add_page(self.template_page)
def generate(self,dest):
outputStream = open(dest,"wb")
self.output.write(outputStream)
outputStream.close()
"""
Use as:
gen = GenerateFromTemplate("template.pdf")
gen.addText("Hello!",(100,200))
gen.addText("PDF!",(100,300))
gen.merge()
gen.generate("Output.pdf")
"""
Hope this helps.
I need to add some extra text to an existing PDF using Python, what is the best way to go about this and what extra modules will I need to install.
Note: Ideally I would like to be able to run this on both Windows and Linux, but at a push Linux only will do.
Edit: pypdf and ReportLab look good but neither one will allow me to edit an existing PDF, are there any other options?
You may have better luck breaking the problem down into converting PDF into an editable format, writing your changes, then converting it back into PDF. I don’t know of a library that lets you directly edit PDF but there are plenty of converters between DOC and PDF for example.
If you’re on Windows, this might work:
There’s also a whitepaper of a PDF creation and editing framework in Python. It’s a little dated, but maybe can give you some useful info:
I know this is an older post, but I spent a long time trying to find a solution. I came across a decent one using only ReportLab and PyPDF so I thought I’d share:
- read your PDF using
PdfFileReader()
, we’ll call this input - create a new pdf containing your text to add using ReportLab, save this as a string object
- read the string object using
PdfFileReader()
, we’ll call this text - create a new PDF object using
PdfFileWriter()
, we’ll call this output - iterate through input and apply
.mergePage(*text*.getPage(0))
for each page you want the text added to, then useoutput.addPage()
to add the modified pages to a new document
This works well for simple text additions. See PyPDF’s sample for watermarking a document.
Here is some code to answer the question below:
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
<do something with canvas>
can.save()
packet.seek(0)
input = PdfFileReader(packet)
From here you can merge the pages of the input file with another document.
Example for [Python 2.7]:
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(file("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = file("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Example for Python 3.x:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(open("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.pages[0]
page.merge_page(new_pdf.pages[0])
output.add_page(page)
# finally, write "output" to a real file
output_stream = open("destination.pdf", "wb")
output.write(output_stream)
output_stream.close()
cpdf will do the job from the command-line. It isn’t python, though (afaik):
cpdf -add-text "Line of text" input.pdf -o output .pdf
pdfrw will let you read in pages from an existing PDF and draw them to a reportlab canvas (similar to drawing an image). There are examples for this in the pdfrw examples/rl1 subdirectory on github. Disclaimer: I am the pdfrw author.
Leveraging David Dehghan‘s answer above, the following works in Python 2.7.13:
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
# create a new PDF with Reportlab
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(290, 720, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader("original.pdf")
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Don’t use mergePage, It may not work for some pdfs
You should use mergeRotatedTranslatedPage
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
The PyPDF2 as of date of writing has depreciated the PdfFileReader, PdfFileWriter and few other methods and changed it to different names and methods and has also changed methods like getPage() directly to attribute of PdfReader.
Here is a very Simple Class to add text to existing pdf file:
(Use is demonstrated at end)
from PyPDF2 import PdfWriter, PdfReader, Transformation
import io
from reportlab.pdfgen.canvas import Canvas
class GenerateFromTemplate:
def __init__(self,template):
self.template_pdf = PdfReader(open(template, "rb"))
self.template_page= self.template_pdf.pages[0]
self.packet = io.BytesIO()
self.c = Canvas(self.packet,pagesize=(self.template_page.mediabox.width,self.template_page.mediabox.height))
def addText(self,text,point):
self.c.drawString(point[0],point[1],text)
def merge(self):
self.c.save()
self.packet.seek(0)
result_pdf = PdfReader(self.packet)
result = result_pdf.pages[0]
self.output = PdfWriter()
op = Transformation().rotate(0).translate(tx=0, ty=0)
result.add_transformation(op)
self.template_page.merge_page(result)
self.output.add_page(self.template_page)
def generate(self,dest):
outputStream = open(dest,"wb")
self.output.write(outputStream)
outputStream.close()
"""
Use as:
gen = GenerateFromTemplate("template.pdf")
gen.addText("Hello!",(100,200))
gen.addText("PDF!",(100,300))
gen.merge()
gen.generate("Output.pdf")
"""
Hope this helps.