Finding and Isolating Marginalia

Finding and Isolating Marginalia

import pytesseract
import cv2


# Locate the large text block(s) on a scanned page and OCR the strip of the
# page lying to the LEFT of each block, i.e. the area where the marginal
# notes are expected to be.
image_path = 'data/sample_mgh_2.jpg'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None instead
    # of raising; fail here with a clear message rather than on image.copy().
    raise FileNotFoundError(f"could not read image: {image_path}")
base_image = image.copy()

# Grayscale -> blur -> inverted Otsu threshold.
# (Presumably the scan is dark ink on a light page, so the inversion makes
# the ink the non-zero foreground for the dilation below.)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (7, 7), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Dilate with a narrow, tall rectangle (3 wide, 25 high) so that foreground
# pixels stacked vertically fuse into larger connected regions.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 25))
dilate = cv2.dilate(thresh, kernel, iterations=1)

# findContours returns 2 values in some OpenCV versions and 3 in others;
# pick the contour list in either case, then order contours top to bottom.
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])

main_text = ""
# Bind ocr_result up front so code that reads it afterwards does not hit a
# NameError when no contour passes the size filter.
ocr_result = ""
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    # Only regions taller than 200 px and wider than 250 px are treated as
    # a main text block.
    if not (h > 200 and w > 250):
        continue
    if x == 0:
        # The block touches the left edge: the strip to its left has zero
        # width, and writing/OCR-ing an empty image would fail.
        continue

    # Everything left of the text block, over the block's vertical extent.
    roi = base_image[y:y+h, 0:x]

    # Pad with a 30 px white border on every side before running OCR.
    constant = cv2.copyMakeBorder(
        roi.copy(), 30, 30, 30, 30, cv2.BORDER_CONSTANT, value=[255, 255, 255]
    )
    ocr_result = pytesseract.image_to_string(constant)
    # NOTE: the same path is written for every matching block, so the file
    # ends up holding the last strip processed.
    cv2.imwrite("temp/output.png", roi)

    print(ocr_result)
786
Ez, 13, 5.
“£117,

Ez. 8, 18. 19.

Toh. 10, 11.

Tudae 12.
Matth. 25, 21.

1, Petr. 5,3.

"fay.

ef, Luc, 12,
35.

“col. 578,

Matth.23, 27,
Matth. 6, 1.

# NOTE(review): `img` is not assigned anywhere in the visible code, so this
# call raises NameError (as the recorded traceback shows). The intended image
# variable needs to be confirmed before this line can work.
ocr_result = pytesseract.image_to_string(img)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-6f996add95e2> in <module>
----> 1 ocr_result = pytesseract.image_to_string(img)

NameError: name 'img' is not defined
# Echo the raw OCR text, then, for every blank-line-separated chunk, print
# the comma-separated pieces that contain no digit at all.
print(ocr_result)
lines = ocr_result.split("\n\n")
for line in lines:
    # A chunk made up solely of digits once its commas are removed is skipped.
    if line.replace(",", "").isdigit():
        continue
    stripped_parts = [part.strip() for part in line.split(",")]
    # Keep only the pieces without any digit character.
    components = [
        part for part in stripped_parts
        if not any(ch.isdigit() for ch in part)
    ]
    print(components)