highlight text using python-docx

Question:

I want to highlight text in a .docx file and save the result to another file.
Here is my code:

from docx import Document

def highlight_text(filename):
    """Open *filename*, rewrite every run containing 'vehicle', save as 't2.docx'.

    Walks each paragraph of the document, and for every run whose text
    contains 'vehicle' clears the run and re-adds its text piece by piece.
    The output path is hard-coded to 't2.docx'. Always returns 1.
    """

    doc = Document(filename)
    for p in doc.paragraphs:
        if 'vehicle' in p.text:
            inline = p.runs
            # print(inline)
            # Loop added to work with runs (strings with same style)
            for i in range(len(inline)):
                # print((inline[i].text).encode('ascii'))
                if 'vehicle' in inline[i].text:
                    # split() yields one more piece than there are
                    # occurrences of 'vehicle'.
                    x=inline[i].text.split('vehicle')
                    inline[i].clear()
                    # NOTE(review): only the first len(x)-1 pieces are written
                    # back, so the text after the last 'vehicle' (x[-1]) is
                    # dropped from the run.
                    for j in range(len(x)-1):
                        inline[i].add_text(x[j])
                        y=inline[i].add_text('vehicle')
                        # NOTE(review): this only sets an attribute on the
                        # object returned by add_text(), which is a text
                        # object rather than a run; highlighting is a font
                        # property (run.font.highlight_color), so no
                        # highlighting results from this line.
                        y.highlight_color='YELLOW'
            # print (p.text)

    doc.save('t2.docx')
    return 1
if __name__ == '__main__':

    # Script entry point: process 't1.docx' from the current directory.
    highlight_text('t1.docx')

The word is not getting highlighted. What am I doing wrong?

Asked By: Vivek Singh

||

Answers:

Highlighting is an attribute of a font, not a run directly. Also, Run.add_text() returns a _Text object, not a run.

from docx.enum.text import WD_COLOR_INDEX

# `document` is assumed to be an already-loaded python-docx Document.
for paragraph in document.paragraphs:
    if 'vehicle' not in paragraph.text:
        continue
    for run in paragraph.runs:
        if 'vehicle' not in run.text:
            continue
        # split() returns one more piece than there are occurrences, so the
        # last piece is the text that follows the final 'vehicle'.
        pieces = run.text.split('vehicle')
        run.clear()
        for piece in pieces[:-1]:
            run.add_text(piece)
            run.add_text('vehicle')
        # Re-add the trailing piece; without this the text after the last
        # 'vehicle' would be deleted from the run.
        run.add_text(pieces[-1])
        # Highlighting is a font property and applies to the whole run, so
        # it only needs to be set once per matching run.
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW

Also, a highlight is applied to the entire run, so you need to create separate runs for the text before “vehicle”, the word “vehicle” itself, and the text after “vehicle”.

Also, there’s no guarantee that a given word appears completely within a single run; runs often split within a word. So you’ll need to be more sophisticated in your approach to handle the general case.

So there’s quite a bit more work to do here, but this should get you seeing at least some yellow highlighting 🙂

Answered By: scanny

This is my solution to this problem. It works with multiple tokens or sequences of characters. First we need to split the runs into multiple runs; in my case I split each matching run into 4 runs using a regex, so that a token can be highlighted and commented. Then comes the function that highlights and comments.

import docx
import re
from docx.enum.text import WD_COLOR_INDEX
# Load the document to process. `yourwordPath` is not defined in this snippet;
# presumably it is a placeholder for the path to your .docx file.
doc=docx.Document(yourwordPath)
def split_text(text, word):
    """Split *text* around an occurrence of *word*.

    Returns a ``(before, word, after)`` tuple of strings, or ``None`` when
    *word* does not occur in *text* starting at a word boundary.

    *word* is matched literally (regex metacharacters are escaped). Because
    the leading group is greedy, the split happens at the LAST qualifying
    occurrence; earlier occurrences end up inside ``before``.
    """
    # [\S\s]* matches any characters including newlines; \b requires the
    # token to start at a word boundary.
    pattern = re.compile(r'([\S\s]*)(\b{})([\S\s]*)'.format(re.escape(word)))
    match = pattern.search(text)
    if match:
        return match.groups()
    return None
def split_Runs(doc,word):
    """Rebuild paragraphs containing *word* so the token sits in its own run.

    For every paragraph whose text contains *word*, the paragraph is emptied
    and its runs are re-created from the text of the original runs. A run
    containing *word* is replaced by four runs: the text before the token,
    an empty run, the token itself, and the text after it. The empty run
    gives style_Token a separate run just before the token to attach its
    comment to.

    Only run text is passed to ``add_run``, so run-level formatting of the
    rebuilt paragraphs is presumably not carried over -- TODO confirm.

    Returns the same *doc* object, modified in place.
    """
    for p in doc.paragraphs:
        if word not in p.text:
            continue
        # Snapshot the runs before emptying the paragraph; their text is
        # read back below to rebuild it.
        original_runs = p.runs
        p.text = ""
        for r in original_runs:
            parts = split_text(r.text, word) if word in r.text else None
            if parts is None:
                # Either the run does not contain the token, or it contains
                # it only where split_text finds no match (e.g. not at a
                # word boundary). Copy the text unchanged instead of failing
                # while unpacking None.
                p.add_run(r.text)
                continue
            before, token, after = parts
            p.add_run(before)
            p.add_run()
            p.add_run(token)
            p.add_run(after)
    return doc
    
def style_Token(doc,word,comment=True):
    """Highlight every run containing *word* in yellow and optionally comment it.

    Args:
        doc: the document whose paragraphs are scanned.
        word: token to look for in each run's text.
        comment: when true, a comment is attached to the run immediately
            before the matching run (or to the matching run itself when it
            is the first run of the paragraph).

    Returns the same *doc* object, modified in place.

    NOTE(review): relies on runs exposing ``add_comment(text, author=...)``;
    confirm that the installed docx package provides it.
    """
    for p in doc.paragraphs:
        # Take one snapshot of the runs instead of rebuilding the list on
        # every access.
        runs = p.runs
        for i, r in enumerate(runs):
            if word not in r.text:
                continue
            r.font.highlight_color = WD_COLOR_INDEX.YELLOW
            if comment:
                # runs[i - 1] with i == 0 would wrap around to the LAST run
                # of the paragraph, so fall back to the matching run itself.
                target = runs[i - 1] if i > 0 else r
                target.add_comment(f'{word} No se encuentra en el documento',author='BOT CONFRONT')
    return doc
# nums is the list of tokens that will be highlighted and commented.
nums=['10231244','48023851','20104802385']
# First pass: isolate every token in its own run. All splitting is done
# before any styling because split_Runs empties and rebuilds a paragraph's
# runs from their text, which would presumably discard highlighting applied
# earlier.
for num in nums:
    doc=split_Runs(doc,num)
# Second pass: highlight each token and attach a comment (comment=True).
for num in nums:
    doc=style_Token(doc,num,True)
# Save to the same path the document was loaded from, overwriting it.
doc.save(yourwordPath)

Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.