highlight text using python-docx
Question:
I want to highlight text in a .docx file and save the result to another file.
Here is my code:
from docx import Document
def highlight_text(filename):
    """Open *filename*, try to highlight the word 'vehicle', save as 't2.docx'.

    Always returns 1. As posted, this code does not produce any visible
    highlighting; that is what the question is asking about.
    """
    doc = Document(filename)
    for p in doc.paragraphs:
        # Only look inside paragraphs whose combined text mentions the word.
        if 'vehicle' in p.text:
            inline = p.runs
            # print(inline)
            # Loop added to work with runs (strings with same style)
            for i in range(len(inline)):
                # print((inline[i].text).encode('ascii'))
                if 'vehicle' in inline[i].text:
                    # Split the run's text around the word, empty the run,
                    # then re-add the pieces one text element at a time.
                    x=inline[i].text.split('vehicle')
                    inline[i].clear()
                    # NOTE(review): the loop stops one short of the end, so the
                    # last piece (x[-1], the text after the final 'vehicle')
                    # is never added back to the run.
                    for j in range(len(x)-1):
                        inline[i].add_text(x[j])
                        y=inline[i].add_text('vehicle')
                        # NOTE(review): `y` is whatever add_text() returns; per
                        # the python-docx docs that is a _Text object, not a
                        # run or font, so setting this attribute is the likely
                        # reason nothing gets highlighted.
                        y.highlight_color='YELLOW'
            # print (p.text)
    # The output file name is hard-coded; the input file is left untouched.
    doc.save('t2.docx')
    return 1
if __name__ == '__main__':
    highlight_text('t1.docx')
The word is not getting highlighted. What am I doing wrong?
Answers:
Highlighting is an attribute of a font, not of a run directly. Also, `Run.add_text()` returns a `_Text` object, not a run.
from docx.enum.text import WD_COLOR_INDEX
# Snippet: assumes `document` is an already-opened python-docx Document.
# Highlights every run that contains the word 'vehicle'.
for paragraph in document.paragraphs:
    if 'vehicle' not in paragraph.text:
        continue
    for run in paragraph.runs:
        if 'vehicle' not in run.text:
            continue
        pieces = run.text.split('vehicle')
        run.clear()
        # Rebuild the run's text: each piece except the last is followed by
        # the word it was split on.
        for piece in pieces[:-1]:
            run.add_text(piece)
            run.add_text('vehicle')
        # Re-add the text that followed the last 'vehicle'; without this the
        # tail of the run is lost.
        if pieces[-1]:
            run.add_text(pieces[-1])
        # Highlighting lives on the run's font and covers the whole run, not
        # just the matched word.
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
Also, a highlight is applied to the entire run, so you need to create separate runs for the text before “vehicle”, the word “vehicle” itself, and the text after “vehicle”.
Also, there’s no guarantee that a given word appears completely within a single run; runs often split within a word. So you’ll need to be more sophisticated in your approach to handle the general case.
So there’s quite a bit more work to do here, but this should get you seeing at least some yellow highlighting 🙂
This is my solution to this problem. It works with multiple tokens or sequences of characters. First, we need to split each run that contains a token into multiple runs; in my case I split it into 4 runs, using a regex, so that the token can be highlighted and commented on. Then a second function applies the highlight and the comment.
import docx
import re
from docx.enum.text import WD_COLOR_INDEX
# `yourwordPath` is not defined in this snippet: set it to the path of the
# .docx file to process before running.
doc = docx.Document(yourwordPath)
def split_text(text, word):
    """Split *text* around an occurrence of *word* that starts at a word boundary.

    *word* is matched literally (regex metacharacters are escaped). Because
    the leading group is greedy, the LAST qualifying occurrence is the one
    used when *word* appears more than once.

    Returns:
        A ``(before, word, after)`` tuple of strings, or ``None`` when
        *word* does not occur at a word boundary in *text*.
    """
    # [\S\s]* matches any character including newlines; \b anchors the start
    # of the token so it is not matched in the middle of a longer word/number.
    pattern = re.compile(r'([\S\s]*)(\b{})([\S\s]*)'.format(re.escape(word)))
    match = pattern.search(text)
    return match.groups() if match else None
def split_Runs(doc, word):
    """Rebuild every paragraph containing *word* so the token sits in its own run.

    For each run whose text contains *word*, four runs are appended in its
    place: the text before the token, an empty run, the token itself, and
    the text after it. Runs without the token are re-added as a single run.
    Only the text of each original run is carried over to the new runs.

    Args:
        doc: the document whose ``paragraphs`` are processed in place.
        word: the literal token to isolate.

    Returns:
        The same *doc* object, to allow chaining.
    """
    for p in doc.paragraphs:
        if word not in p.text:
            continue
        # Snapshot the runs before the paragraph content is replaced below.
        original_runs = p.runs
        p.text = ""
        for r in original_runs:
            # split_text() returns None when the token is present only as part
            # of a longer word; treat that like "no match" instead of crashing
            # on the tuple unpack.
            parts = split_text(r.text, word) if word in r.text else None
            if parts is None:
                p.add_run(r.text)
                continue
            before, token, after = parts
            p.add_run(before)
            # Empty spacer run placed directly before the token's run.
            p.add_run()
            p.add_run(token)
            p.add_run(after)
    return doc
def style_Token(doc, word, comment=True):
    """Highlight in yellow every run containing *word*, optionally adding a comment.

    Args:
        doc: the document whose paragraphs are scanned and modified in place.
        word: the literal token to look for in each run's text.
        comment: when true, a comment is attached to the run that precedes
            the matching run (or to the matching run itself when it is the
            first run of the paragraph).

    Returns:
        The same *doc* object, to allow chaining.
    """
    for p in doc.paragraphs:
        # Read the run list once per paragraph instead of on every access.
        runs = p.runs
        for i, r in enumerate(runs):
            if word not in r.text:
                continue
            r.font.highlight_color = WD_COLOR_INDEX.YELLOW
            if comment:
                # Index i - 1 would wrap around to the last run when i == 0,
                # so fall back to the matching run in that case.
                anchor = runs[i - 1] if i > 0 else r
                # NOTE(review): add_comment() on a run must be provided by the
                # installed docx package; confirm it is available.
                # The message is Spanish for "<word> is not found in the document".
                anchor.add_comment(f'{word} No se encuentra en el documento',author='BOT CONFRONT')
    return doc
# Tokens that will be highlighted and commented on.
nums = ['10231244', '48023851', '20104802385']

# Pass 1: isolate every token in its own run.
for token in nums:
    doc = split_Runs(doc, token)

# Pass 2: once all splitting is done, highlight and comment each token.
for token in nums:
    doc = style_Token(doc, token, True)

# Write the result back to the same path the document was loaded from.
doc.save(yourwordPath)
I want to highlight text in a .docx file and save the result to another file.
Here is my code:
from docx import Document
def highlight_text(filename):
    """Open *filename*, try to highlight the word 'vehicle', save as 't2.docx'.

    Always returns 1. As posted, this code does not produce any visible
    highlighting; that is what the question is asking about.
    """
    doc = Document(filename)
    for p in doc.paragraphs:
        # Only look inside paragraphs whose combined text mentions the word.
        if 'vehicle' in p.text:
            inline = p.runs
            # print(inline)
            # Loop added to work with runs (strings with same style)
            for i in range(len(inline)):
                # print((inline[i].text).encode('ascii'))
                if 'vehicle' in inline[i].text:
                    # Split the run's text around the word, empty the run,
                    # then re-add the pieces one text element at a time.
                    x=inline[i].text.split('vehicle')
                    inline[i].clear()
                    # NOTE(review): the loop stops one short of the end, so the
                    # last piece (x[-1], the text after the final 'vehicle')
                    # is never added back to the run.
                    for j in range(len(x)-1):
                        inline[i].add_text(x[j])
                        y=inline[i].add_text('vehicle')
                        # NOTE(review): `y` is whatever add_text() returns; per
                        # the python-docx docs that is a _Text object, not a
                        # run or font, so setting this attribute is the likely
                        # reason nothing gets highlighted.
                        y.highlight_color='YELLOW'
            # print (p.text)
    # The output file name is hard-coded; the input file is left untouched.
    doc.save('t2.docx')
    return 1
if __name__ == '__main__':
    highlight_text('t1.docx')
The word is not getting highlighted. What am I doing wrong?
Highlighting is an attribute of a font, not of a run directly. Also, `Run.add_text()` returns a `_Text` object, not a run.
from docx.enum.text import WD_COLOR_INDEX
# Snippet: assumes `document` is an already-opened python-docx Document.
# Highlights every run that contains the word 'vehicle'.
for paragraph in document.paragraphs:
    if 'vehicle' not in paragraph.text:
        continue
    for run in paragraph.runs:
        if 'vehicle' not in run.text:
            continue
        pieces = run.text.split('vehicle')
        run.clear()
        # Rebuild the run's text: each piece except the last is followed by
        # the word it was split on.
        for piece in pieces[:-1]:
            run.add_text(piece)
            run.add_text('vehicle')
        # Re-add the text that followed the last 'vehicle'; without this the
        # tail of the run is lost.
        if pieces[-1]:
            run.add_text(pieces[-1])
        # Highlighting lives on the run's font and covers the whole run, not
        # just the matched word.
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
Also, a highlight is applied to the entire run, so you need to create separate runs for the text before “vehicle”, the word “vehicle” itself, and the text after “vehicle”.
Also, there’s no guarantee that a given word appears completely within a single run; runs often split within a word. So you’ll need to be more sophisticated in your approach to handle the general case.
So there’s quite a bit more work to do here, but this should get you seeing at least some yellow highlighting 🙂
This is my solution to this problem. It works with multiple tokens or sequences of characters. First, we need to split each run that contains a token into multiple runs; in my case I split it into 4 runs, using a regex, so that the token can be highlighted and commented on. Then a second function applies the highlight and the comment.
import docx
import re
from docx.enum.text import WD_COLOR_INDEX
# `yourwordPath` is not defined in this snippet: set it to the path of the
# .docx file to process before running.
doc = docx.Document(yourwordPath)
def split_text(text, word):
    """Split *text* around an occurrence of *word* that starts at a word boundary.

    *word* is matched literally (regex metacharacters are escaped). Because
    the leading group is greedy, the LAST qualifying occurrence is the one
    used when *word* appears more than once.

    Returns:
        A ``(before, word, after)`` tuple of strings, or ``None`` when
        *word* does not occur at a word boundary in *text*.
    """
    # [\S\s]* matches any character including newlines; \b anchors the start
    # of the token so it is not matched in the middle of a longer word/number.
    pattern = re.compile(r'([\S\s]*)(\b{})([\S\s]*)'.format(re.escape(word)))
    match = pattern.search(text)
    return match.groups() if match else None
def split_Runs(doc, word):
    """Rebuild every paragraph containing *word* so the token sits in its own run.

    For each run whose text contains *word*, four runs are appended in its
    place: the text before the token, an empty run, the token itself, and
    the text after it. Runs without the token are re-added as a single run.
    Only the text of each original run is carried over to the new runs.

    Args:
        doc: the document whose ``paragraphs`` are processed in place.
        word: the literal token to isolate.

    Returns:
        The same *doc* object, to allow chaining.
    """
    for p in doc.paragraphs:
        if word not in p.text:
            continue
        # Snapshot the runs before the paragraph content is replaced below.
        original_runs = p.runs
        p.text = ""
        for r in original_runs:
            # split_text() returns None when the token is present only as part
            # of a longer word; treat that like "no match" instead of crashing
            # on the tuple unpack.
            parts = split_text(r.text, word) if word in r.text else None
            if parts is None:
                p.add_run(r.text)
                continue
            before, token, after = parts
            p.add_run(before)
            # Empty spacer run placed directly before the token's run.
            p.add_run()
            p.add_run(token)
            p.add_run(after)
    return doc
def style_Token(doc, word, comment=True):
    """Highlight in yellow every run containing *word*, optionally adding a comment.

    Args:
        doc: the document whose paragraphs are scanned and modified in place.
        word: the literal token to look for in each run's text.
        comment: when true, a comment is attached to the run that precedes
            the matching run (or to the matching run itself when it is the
            first run of the paragraph).

    Returns:
        The same *doc* object, to allow chaining.
    """
    for p in doc.paragraphs:
        # Read the run list once per paragraph instead of on every access.
        runs = p.runs
        for i, r in enumerate(runs):
            if word not in r.text:
                continue
            r.font.highlight_color = WD_COLOR_INDEX.YELLOW
            if comment:
                # Index i - 1 would wrap around to the last run when i == 0,
                # so fall back to the matching run in that case.
                anchor = runs[i - 1] if i > 0 else r
                # NOTE(review): add_comment() on a run must be provided by the
                # installed docx package; confirm it is available.
                # The message is Spanish for "<word> is not found in the document".
                anchor.add_comment(f'{word} No se encuentra en el documento',author='BOT CONFRONT')
    return doc
# Tokens that will be highlighted and commented on.
nums = ['10231244', '48023851', '20104802385']

# Pass 1: isolate every token in its own run.
for token in nums:
    doc = split_Runs(doc, token)

# Pass 2: once all splitting is done, highlight and comment each token.
for token in nums:
    doc = style_Token(doc, token, True)

# Write the result back to the same path the document was loaded from.
doc.save(yourwordPath)