deathCertReader / app.py
Alealejandrooo's picture
Fixing the dimensions in extract_detected_entries_pdl()
922a160
raw
history blame
10.1 kB
import re
import cv2
import numpy as np
from paddleocr import PaddleOCR
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import onnxruntime
import gradio as gr
# initialize the OCR
# Single module-level PaddleOCR instance; extract_detected_entries_pdl() calls
# ocr.ocr() on it for every request, so the engine is built only once at import.
# NOTE(review): lang='sl' presumably selects the Slovenian recognition model --
# confirm against PaddleOCR's supported-language list.
ocr = PaddleOCR(lang='sl',
enable_mkldnn=True,
cls=False,
show_log= False)
# initialize the models
# ONNX Runtime sessions created once at import time from files under ./models.
# pdf_extract_gr() hands model_deskew to deskew() and model_denoise to
# autoencode_ONNX().
model_deskew = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
model_denoise = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
##### All Functions #####
def preprocess_image(image, scale=1.494):
    '''
    Function: shrink an image so that later stages work on fewer pixels
    Input: image (anything np.array() accepts, e.g. a PIL image or ndarray)
           scale: divisor applied to both width and height (default 1.494,
           the value this pipeline has always used)
    Output: downscaled image as a numpy array
    Raises: ValueError if scale is not positive
    '''
    if scale <= 0:
        raise ValueError("scale must be positive")
    image = np.array(image)
    height, width = image.shape[:2]
    # cv2.resize rejects a zero-sized target, so never go below one pixel
    dim = (max(1, int(width / scale)), max(1, int(height / scale)))
    return cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
def deskew(image, model):
    '''
    Function: deskew an image
    Input: image as an array, and the skew-classification model (an object
           whose run() takes a {'conv2d_input': batch} feed)
    Output: deskewed image, predicted skew angle in degrees, and the model's
            score for that angle multiplied by 100
    '''
    # class index -> skew angle in degrees; the order is the lexicographic
    # order of the angle labels ('-1', '-10', '-11', ...)
    class_angles = (
        -1, -10, -11, -12, -13, -14, -15, -2, -3, -4,
        -5, -6, -7, -8, -9, 0, 1, 10, 11, 12,
        13, 14, 15, 180, 2, 270, 3, 4, 5, 6,
        7, 8, 9, 90,
    )
    # NOTE(review): the conversion assumes BGR channel order -- confirm that
    # matches what the caller actually supplies.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # shrink to a fifth of the size first, then to the 200x200 network input
    fifth = (int(gray.shape[1] * 0.2), int(gray.shape[0] * 0.2))
    small = cv2.resize(gray, fifth, interpolation = cv2.INTER_AREA)
    net_input = cv2.resize(small, (200, 200))

    # add batch and channel dimensions, scale pixel values to [0, 1]
    net_input = net_input.astype('float32').reshape(1, 200, 200, 1) / 255

    scores = model.run(None, {'conv2d_input': net_input})[0]
    best = scores.argmax()
    angle = class_angles[best]
    skew_confidence = scores[0][best] * 100

    # quarter turns are exact rotations that also swap width and height
    quarter_turns = {
        90: cv2.ROTATE_90_COUNTERCLOCKWISE,
        270: cv2.ROTATE_90_CLOCKWISE,
    }
    if angle in quarter_turns:
        return cv2.rotate(image, quarter_turns[angle]), angle, skew_confidence

    # any other angle: rotate about the centre, keeping the original canvas
    rows, cols = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((cols // 2, rows // 2), -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, rotation, (cols, rows),
                                    flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence
def prepare_image_to_autoencoder(image):
    '''
    Function: crop a fixed region out of the image and bring it into the
              form the autoencoder is fed with.
    Input: image: deskewed image
    Output: 600x600x1 grayscale array with values divided by 255
    '''
    side = 600
    rows, cols = image.shape[:2]

    # crop window expressed as fixed fractions of the image size
    top, bottom = int(rows / 3.6), int(rows / 1.87)
    left, right = int(cols / 3.67), int(cols / 1.575)
    patch = image[top:bottom, left:right]

    # fixed size first, then single channel
    patch = cv2.resize(patch, (side, side))
    gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)

    # normalise and add the trailing channel axis
    return (gray / 255.0).reshape(side, side, 1)
def autoencode_ONNX(image, model):
    '''
    Function: remove noise from image
    Input: image holding 600*600 values, and the autoencoder model (an
           object whose run() takes a {'input_2': batch} feed)
    Output: denoised image as a uint8 array
    '''
    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    denoised = model.run(None, {'input_2': batch})[0].squeeze()
    # Clip before the cast: a model output slightly outside [0, 1] would
    # otherwise wrap around in uint8 instead of saturating at black/white.
    return np.clip(denoised * 255, 0, 255).astype('uint8')
def extract_detected_entries_pdl(image):
    """
    Extracts text, scores, and boundary boxes from an image using OCR and returns a DataFrame.

    The image is run through the module-level OCR engine; for every detected
    entry the text (passed through cleanString_basic), the confidence score
    and the boundary box are collected.

    Parameters
    ----------
    image : numpy.ndarray
        The input image to be processed.

    Returns
    -------
    pandas.DataFrame
        One row per detected text entry, with the columns:
        - "Text": The detected text.
        - "Score": The confidence score for the detected text.
        - "Boundary Box": The box corner points as a tuple of tuples.
        The frame has these columns but no rows when nothing is detected.
    """
    # run the OCR
    result = ocr.ocr(image)

    # Only the first element of the result is used. It is None when no text
    # was found, so fall back to an empty list instead of iterating over None.
    entries = (result[0] if result else None) or []

    txt = [cleanString_basic(entry[-1][0]) for entry in entries]
    scores = [entry[-1][1] for entry in entries]
    boxes = [tuple(map(tuple, entry[0])) for entry in entries]

    return pd.DataFrame({"Text": txt, "Score": scores, "Boundary Box": boxes})
def cleanString_basic(word):
    '''
    Function: replace every "$" in the word with "s"
    Input: word (string)
    Output: the word with the replacement applied
    '''
    return word.replace("$", "s")
def clean_string_start(string: 'str'):
    '''
    Function: drop one unwanted character from the start of the string.
    Input: string
    Output: (cleaned string, flag) where flag is the removed character, or
            "√" when nothing was removed
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.startswith(unwanted):
        return string, "√"
    return string[1:], string[0]
def clean_string_end(string: 'str'):
    '''
    Function: drop one unwanted character from the end of the string.
    Input: string
    Output: (cleaned string, flag) where flag is the removed character, or
            "√" when nothing was removed
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.endswith(unwanted):
        return string, "√"
    return string[:-1], string[-1]
def clean_dates(date: 'str'):
    '''
    Function: cleans the field "datum smrti" and reports what was removed.
    Input: date (string format)
    Output: (cleaned string, flags) where flags is the list of removed
            characters, or the string "Y" when nothing had to be removed
    '''
    # ASCII letters plus the characters '!', '[' and '|'
    unwanted = r'[a-zA-Z!\[\|]'
    removed = re.findall(unwanted, date)
    cleaned = re.sub(unwanted, '', date)
    return cleaned, (removed if removed else "Y")
##### Main Function #####
def pdf_extract_gr(image):
    '''
    Function: run the whole pipeline on one uploaded image: downscale,
              deskew, crop and denoise, OCR, then pick out the first three
              OCR entries as first name, surname and date of death.
    Input: image as delivered by the Gradio image input
    Output: tuple of (OCR DataFrame, deskewed image, skew angle, skew
            confidence, denoised image, first name, first name confidence,
            surname, surname confidence, date of death, date of death
            confidence). Confidences are strings such as "98.123%"; a field
            and its confidence are empty strings when the OCR returned too
            few entries to fill it.
    '''
    extractimg = preprocess_image(image)

    # deskew the image
    deskewed_image, angle, skew_confidence = deskew(extractimg, model_deskew)

    # prepare the image for the autoencoder
    cleanimg = prepare_image_to_autoencoder(deskewed_image)

    # clean the image
    img = autoencode_ONNX(cleanimg, model_denoise)

    # extract the entries from the image
    df = extract_detected_entries_pdl(img)

    # the fields are taken by position: rows 0, 1 and 2 of the OCR output
    firstname, firstnameconfidence = _text_and_confidence(df, 0)
    surname, surnameconfidence = _text_and_confidence(df, 1)
    dodname, dodconfidence = _text_and_confidence(df, 2)

    # return all the results
    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)


def _text_and_confidence(df, row):
    '''
    Function: read one OCR entry as (text, confidence percentage string).
    Input: df: OCR DataFrame with text in column 0 and score in column 1
           row: positional row index
    Output: (text, "NN.NNN%"), or ("", "") when the row does not exist
    '''
    if row >= len(df):
        return "", ""
    # Explicit positional access: indexing a string-labelled Series with an
    # integer relies on a deprecated pandas fallback.
    text = df.iloc[row, 0]
    confidence = round(float(df.iloc[row, 1]) * 100, 3)
    return text, f"{confidence}%"
##### Gradio Style #####
# Custom stylesheet passed to gr.Blocks(css = css) below. The selectors match
# the elem_classes values given to the components: run_container and run_btn
# for the button area, results_container and results_cell for the result
# rows. upload_cell is only referenced from a commented-out component, and
# .results_cell is currently an empty rule.
css = """
.run_container {
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
}
.run_btn {
margin: auto;
width: 50%;
display: flex;
}
.upload_cell {
margin: auto;
display: flex;
}
.results_container {
display: flex;
justify-content: space-evenly;
}
.results_cell {
}
"""
##### Gradio Blocks #####
# UI layout: a run button, the image input next to the three extracted
# fields, and collapsible sections for the full OCR table, the denoised image
# and the deskew result.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation -- confirm it against the intended layout.
with gr.Blocks(css = css) as demo:
    gr.Markdown("""
# Death Certificate Extraction
""", elem_classes = "h1")
    # NOTE(review): the text mentions a PDF, but the input below is gr.Image.
    gr.Markdown("Upload a PDF, extract data")
    with gr.Box(elem_classes = "run_container"):
        # ExtractInput = gr.File(label = "Death Certificate", elem_classes="upload_cell")
        # The caption of a Button is its value; passing it as label left the
        # button showing the default caption instead of "Extract".
        ExtractButton = gr.Button(value = "Extract", elem_classes="run_btn")
    with gr.Row(elem_id = "hide"):
        with gr.Column():
            ExtractInput = gr.Image()
        with gr.Column():
            # ExtractResult = gr.Image(label = "result")
            with gr.Row(elem_classes = "results_container"):
                FirstNameBox = gr.Textbox(label = "First Name", elem_classes = "results_cell")
                FirstNameConfidenceBox = gr.Textbox(label = "First Name Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                SurnameNameBox = gr.Textbox(label = "Surname", elem_classes = "results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label = "Surname Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                DODBox = gr.Textbox(label = "Date of Death", elem_classes = "results_cell")
                DODConfidenceBox = gr.Textbox(label = "Date of Death Confidence", elem_classes = "results_cell")
    with gr.Accordion("Full Results", open = False):
        ExtractDF = gr.Dataframe(label = "Results")
    with gr.Accordion("Clean Image", open = False):
        CleanOutput = gr.Image()
    with gr.Accordion("Deskew", open = False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label = "Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label = "Confidence")
    # The outputs are listed in the same order as pdf_extract_gr's return tuple.
    ExtractButton.click(fn=pdf_extract_gr,
                        inputs = ExtractInput,
                        outputs = [ExtractDF, DeskewOutput, DeskewAngle,
                                   DeskewConfidence, CleanOutput, FirstNameBox,
                                   FirstNameConfidenceBox, SurnameNameBox,
                                   SurnameNameConfidenceBox, DODBox, DODConfidenceBox])
demo.launch(show_api=True, share=False, debug=True)