How to find only the bolded text lines from no. of images

Question

from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import os
import pandas as pd
import cv2
import numpy as np

files = os.chdir("C:/Users/abhishek_kumar1/Desktop/New folder")
#print(os.getcwd())
pages = convert_from_path("d.pdf",190,single_file=True,
                      poppler_path='C:/Users/abhishek_kumar1/Downloads/poppler-0.68.0_x86/poppler-0.68.0/bin')
image_counter=1
for page in pages:
    filename = "page_"+str(image_counter)+".jpg"
    page.save(filename,'JPEG')

img = cv2.imread(filename)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.imwrite('grey.png',gray)
binary,thresh1 = cv2.threshold(gray, 0, 255,cv2.THRESH_OTSU|cv2.THRESH_BINARY_INV)
cv2.imwrite('Thresh1.png',thresh1)
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3))
dilation = cv2.dilate(thresh1, rect_kernel, iterations = 6)
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
im2 = img.copy()


ROI_number = 0
for cnt in contours[::-1]:
    [x,y,w,h] = cv2.boundingRect(cnt)
    ROI=im2[y:y+h, x:x+w]
    #print(str(w),str(h))
    #cv2.putText(im2, str(h), (x,y - 10 ), cv2.FONT_HERSHEY_SIMPLEX, 0.1, (255, 0, 0), 1)
    #cv2.putText(im2, str(w), (x,y + 10 ), cv2.FONT_HERSHEY_SIMPLEX, 0.1, (0, 0, 255), 1)
    cv2.imwrite('ROI_{}.jpg'.format(ROI_number),ROI)
    cv2.rectangle(im2,(x,y),(x+w,y+h),(36,255,12),1)
    ROI_number += 1

cv2.imwrite('contours1.png',im2)

How to find only this image from above code section section, is there any options to understand font type from image like bold, italic,something else
get trouble to find only the bold line part from all of images.
Please any body have a suggestion regarding this please help me out.

Asked By: Abhishek Kumar

||

Source

Answer 1

See python code and result:

import cv2
import numpy as np
img = cv2.imread('C.png')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 160, 255, cv2.THRESH_BINARY)[1]
kernel = np.ones((5,5),np.uint8)
kernel2 = np.ones((3,3),np.uint8)
marker = cv2.dilate(thresh,kernel,iterations = 1)
mask=cv2.erode(thresh,kernel,iterations = 1)

while True:
    tmp=marker.copy()
    marker=cv2.erode(marker, kernel2)
    marker=cv2.max(mask, marker)
    difference = cv2.subtract(tmp, marker)
    if cv2.countNonZero(difference) == 0:
        break

marker_color = cv2.cvtColor(marker, cv2.COLOR_GRAY2BGR)
out=cv2.bitwise_or(img, marker_color)
cv2.imwrite('out.png', out)
cv2.imshow('result', out )

Answered By: Alex Alex

Answer 2

Alex Alex’s answer did not work for me. Here is my alternative described in words.

The general idea is that we compare how many black pixels there are in comparison to the minimum possible pixels to still form characters. This provides us with a difference from the skeleton to normal text and skeleton to bold text. In this way, we can quite clearly separate normal text from the bold text.

Use OCR software to extract bounding boxes of individual words. Optional: Combine individual words into lines of words, for example by word_num in Pytesseract.
Convert the image to grayscale and invert the image colors
Perform Zhang-Suen thinning on the selected area of text on the image (opencv contribution: cv2.ximgproc.thinning)
Sum where there are white pixels in the thinned image, i.e. where values are equal to 255 (white pixels are letters)
Sum where there are white pixels in the inverted image
Finally compute the thickness (sum_inverted_pixels - sum_skeleton_pixels) / sum_skeleton_pixels (sometimes there will be zero division error, check when the sum of the skeleton is 0 and return 0 instead)
Normalize the thickness by minimum and maximum values
Apply a threshold for deciding when a word/line of text is bold, e.g. 0.6 or 0.7

Answered By: Casper Hansen

How to find only the bolded text lines from no. of images

Question:

Answers: