Spaces:
Sleeping
Sleeping
# from alessandro | |
import re | |
import cv2 | |
import numpy as np | |
from paddleocr import PaddleOCR | |
from PIL import Image | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Shared OCR engine, built once at import time and reused by
# extract_detected_entries_pdl. NOTE(review): 'sl' is presumably PaddleOCR's
# Slovenian language pack - confirm against the installed PaddleOCR version.
ocr = PaddleOCR(lang='sl')
# def convert_to_image(document): | |
# ''' | |
# Function: converts the pdf to image | |
# Input: pdf document | |
# Output: image | |
# ''' | |
# # reads PDFs | |
# # reads only first page of PDF documents | |
# # os.path.join(document.name, 'sample.pdf') | |
# pdf_document = load_from_file(document) | |
# page_1 = pdf_document.create_page(0) | |
# images = renderer.render_page(page_1) | |
# image_data = image.data | |
# # convert the image to numpy array | |
# image = np.array(images) | |
# # handles non-PDF formats (e.g., .tif) | |
# # else: | |
# # images = Image.open(document) | |
# # # convert the image to RGB | |
# # image = images.convert('RGB') | |
# # # convert the image to numpy array | |
# # image = np.array(image) | |
# # # TODO: change to dynamic scaling | |
# # # downscale the image | |
# # scale = 1.494 | |
# # width = int(image.shape[1] / scale) | |
# # height = int(image.shape[0] / scale) | |
# # dim = (width, height) | |
# # image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA) | |
# # fig, ax = plt.subplots(figsize=(15, 10)) | |
# # ax.imshow(image, cmap = 'gray') | |
# return image | |
def deskew(image, model):
    '''
    Function: predict the skew of an image with a classifier and rotate the
              image back by the predicted angle
    Input: image as an array (3-channel, or already single-channel) and a
           model exposing run(); the model is fed a (1, 200, 200, 1) float32
           tensor under the input name 'conv2d_input'
    Output: tuple (deskewed image, predicted angle in degrees as int,
            confidence of the prediction in percent as float)
    '''
    # map the model classes to the actual degree of skew; the class order is
    # the lexicographic order of the label strings, so it must stay as listed
    class_to_angle = {
        0: -1, 1: -10, 2: -11, 3: -12, 4: -13,
        5: -14, 6: -15, 7: -2, 8: -3, 9: -4,
        10: -5, 11: -6, 12: -7, 13: -8, 14: -9,
        15: 0, 16: 1, 17: 10, 18: 11, 19: 12,
        20: 13, 21: 14, 22: 15, 23: 180, 24: 2,
        25: 270, 26: 3, 27: 4, 28: 5, 29: 6,
        30: 7, 31: 8, 32: 9, 33: 90,
    }

    # the classifier works on a single channel; skip the conversion when the
    # caller already passes one, because cvtColor would raise on it
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # two-step resize kept exactly as before (area-downscale to 20 %, then
    # 200x200) so the model sees the same preprocessing
    small_size = (int(gray.shape[1] * 0.2), int(gray.shape[0] * 0.2))
    small = cv2.resize(gray, small_size, interpolation=cv2.INTER_AREA)
    model_input = cv2.resize(small, (200, 200))

    # add batch and channel dimensions, then normalize to [0, 1]
    model_input = model_input.astype('float32').reshape(1, 200, 200, 1) / 255

    scores = model.run(None, {'conv2d_input': model_input})[0]
    best_class = int(scores.argmax())
    angle = class_to_angle[best_class]
    # plain float so downstream consumers do not receive a NumPy scalar
    skew_confidence = float(scores[0][best_class]) * 100

    # quarter turns are done losslessly with cv2.rotate
    if angle == 90:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE), angle, skew_confidence
    if angle == 270:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE), angle, skew_confidence

    # every other angle: rotate around the centre by the opposite angle,
    # replicating the border pixels into the corners that open up
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    rotation = cv2.getRotationMatrix2D(center, -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, rotation, (w, h), flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence
def prepare_image_to_autoencoder(image, target_height=600, target_width=600):
    '''
    Function: prepare the image to be passed to the autoencoder: crop a fixed
              region, resize it, convert it to a single channel and normalize
    Input: image: deskewed image (3-channel, or already single-channel)
           target_height, target_width: size of the returned image; the
           defaults match the 600x600 input used by autoencode_ONNX
    Output: float array of shape (target_height, target_width, 1) with
            values in [0, 1]
    '''
    height, width = image.shape[:2]
    # fixed crop expressed as fractions of the page size
    cropped = image[int(height / 3.6): int(height / 1.87),
                    int(width / 3.67): int(width / 1.575)]
    # reshape image to fixed size
    resized = cv2.resize(cropped, (target_width, target_height))
    # cvtColor raises on single-channel input, so only convert colour images
    if resized.ndim == 3:
        resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # normalize and add the channel axis expected by the autoencoder
    return (resized / 255.0).reshape(target_height, target_width, 1)
def autoencode_ONNX(image, model):
    '''
    Function: remove noise from image
    Input: image holding 600*600 values (reshaped here to (1, 600, 600, 1)
           float32) and an autoencoder model exposing run() with an input
           named 'input_2'
    Output: denoised image as a uint8 array with the singleton axes removed
    '''
    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    restored = model.run(None, {'input_2': batch})[0].squeeze()
    # clip before the cast: a value slightly outside [0, 1] would otherwise
    # wrap around in uint8 (e.g. a bright pixel turning black)
    return np.clip(restored * 255, 0, 255).astype('uint8')
def detect_entries_ONNX(denoised, model):
    '''
    Function: detect boxes Priimek, Ime and Datum boxes
        Priimek: lastname
        Ime: firstname
        Datum smrti: date of death
    Input: denoised single-channel image and an object detection model
           exposing run() with an input named 'input_tensor'
    Output: PIL image of a plot with the detected boxes drawn in green, and
            the list of confidence scores in the same (top-to-bottom) order
            as the boxes
    '''
    import io

    # the object detection model requires a tensor(1, h, w, 3)
    autoencoded_RGB = cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
    autoencoded_expanded = np.expand_dims(autoencoded_RGB, axis=0)
    detections = model.run(None, {'input_tensor': autoencoded_expanded})

    # output 1 holds the boxes, output 4 the confidences; the number of
    # detections is taken from the data instead of being fixed to 5
    boxes = np.array(detections[1][0]).reshape(-1, 4)
    confidence = np.array(detections[4]).reshape(-1, 1)
    boxes_and_confidence = np.append(boxes, confidence, axis=1)
    # sort the rows by their first column (ymin) so they read top to bottom
    boxes_and_confidence = \
        boxes_and_confidence[boxes_and_confidence[:, 0].argsort()]

    # boxes come as fractions of the image (ymin, xmin, ymax, xmax); scale
    # them to pixels with the real image size (600x600 in the current pipeline)
    height, width = autoencoded_RGB.shape[:2]
    boxes = boxes_and_confidence[:, :-1] * np.array([height, width, height, width])
    confidence_boxes = boxes_and_confidence[:, -1].tolist()

    for ymin, xmin, ymax, xmax in boxes.astype(int).tolist():
        cv2.rectangle(autoencoded_RGB, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

    # render the plot into memory: no shared "test.jpg" on disk, no BGR/RGB
    # mix-up from cv2.imread, and the figure is always closed afterwards.
    # The former BGR->RGB swap before imshow was a no-op here (pixels are
    # either gray or pure green), so the array is shown directly.
    buffer = io.BytesIO()
    fig = plt.figure()
    try:
        plt.imshow(autoencoded_RGB)
        plt.title("Detected Boxes")
        fig.savefig(buffer, format="jpg")
    finally:
        plt.close(fig)
    buffer.seek(0)
    return Image.open(buffer).convert("RGB"), confidence_boxes
def extract_detected_entries_pdl(image):
    '''
    Function: run the module-level PaddleOCR engine on an image and collect
              the recognised lines
    Input: image as an array
    Output: DataFrame with the columns "Text" (passed through
            cleanString_basic), "Score" and "Boundary Box", one row per
            recognised line; empty when nothing was recognised
    '''
    result = ocr.ocr(image, cls=False)
    # result[0] holds the lines of the (single) image; guard against an
    # empty or None entry so a blank scan yields an empty frame, not a crash
    lines = result[0] if result and result[0] else []
    # each line is [box, (text, score)]; building the frame column by column
    # avoids the ragged np.transpose([...]) that recent NumPy rejects
    return pd.DataFrame(
        {
            "Text": [cleanString_basic(line[-1][0]) for line in lines],
            "Score": [line[-1][1] for line in lines],
            "Boundary Box": [line[0] for line in lines],
        },
        columns=["Text", "Score", "Boundary Box"],
    )
def cleanString_basic(word: str) -> str:
    '''
    Function: replace every "$" in the text with "s"
    Input: word (string)
    Output: string with the replacements applied
    '''
    return word.replace("$", "s")
def clean_string_start(string: str):
    '''
    Function: remove one unwanted character from the start of a string
    Input: string
    Output: tuple (string without that leading character, the character that
            was removed or "√" when nothing was removed)
    '''
    chars_to_remove = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    # only the first character is inspected; further ones are left in place
    if string.startswith(chars_to_remove):
        return string[1:], string[0]
    return string, "√"
def clean_string_end(string: str):
    '''
    Function: remove one unwanted character from the end of a string
    Input: string
    Output: tuple (string without that trailing character, the character
            that was removed or "√" when nothing was removed)
    '''
    chars_to_remove = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    # only the last character is inspected; earlier ones are left in place
    if string.endswith(chars_to_remove):
        return string[:-1], string[-1]
    return string, "√"
def clean_dates(date: str):
    '''
    Function: cleans the fields "datum smrti" and returns the chars removed.
    Input: date (string format)
    Output: tuple (date without ASCII letters, "!", "[" and "|", the list of
            removed characters or "Y" when nothing was removed)
    '''
    unwanted = r'[a-zA-Z!\[\|]'
    # finds special characters in the string
    special_char = re.findall(unwanted, date)
    date_flags = special_char if special_char else "Y"
    # remove special characters in the string
    return re.sub(unwanted, '', date), date_flags
def regex_string(string):
    '''
    Function: swaps the characters with the "hat" (caron) with the regular ones
    Input: string
    Output: cleaned string
    '''
    # single pass with a translation table instead of one replace() per hit;
    # also avoids shadowing the builtin map()
    without_caron = str.maketrans({
        'Č': 'C',
        'č': 'c',
        'Š': 'S',
        'š': 's',
        'Ž': 'Z',
        'ž': 'z',
    })
    return string.translate(without_caron)
import onnxruntime | |
def pdf_deskew_gr(document):
    '''
    Function: deskew an uploaded scan
    Input: document: an already loaded image (anything np.array() can turn
           into an image array). The PDF loader convert_to_image is commented
           out in this file, so calling it raised NameError; like
           pdf_extract_gr, the input is now taken as an image.
    Output: tuple (deskewed image, angle, skew confidence) from deskew()
    '''
    img = np.array(document)
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    return deskew(img, model)
def pdf_clean_gr(document):
    '''
    Function: deskew an uploaded scan and denoise its cropped region
    Input: document: an already loaded image (anything np.array() can turn
           into an image array). The PDF loader convert_to_image is commented
           out in this file, so calling it raised NameError; like
           pdf_extract_gr, the input is now taken as an image.
    Output: denoised image returned by autoencode_ONNX
    '''
    img = np.array(document)
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, _angle, _skew_confidence = deskew(img, model)
    # feed the deskewed image on (the skewed original was passed before,
    # which threw the deskew result away)
    prepared = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    return autoencode_ONNX(prepared, model)
def pdf_resnet_gr(document):
    '''
    Function: deskew and denoise an uploaded scan, then detect the entry boxes
    Input: document: an already loaded image (anything np.array() can turn
           into an image array). The PDF loader convert_to_image is commented
           out in this file, so calling it raised NameError; like
           pdf_extract_gr, the input is now taken as an image.
    Output: tuple (image with the detected boxes, confidence scores) from
            detect_entries_ONNX
    '''
    # models are loaded from ./models/ like in the other pipeline functions;
    # the previous absolute Colab Drive paths only exist inside that notebook
    img = np.array(document)
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, _angle, _skew_confidence = deskew(img, model)
    # feed the deskewed image on (the skewed original was passed before)
    prepared = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    denoised = autoencode_ONNX(prepared, model)
    model = onnxruntime.InferenceSession("./models/ResNet_od_v0.0.2.onnx")
    boxes, confidence_boxes = detect_entries_ONNX(denoised, model)
    return boxes, confidence_boxes
def pdf_extract_gr(extractimg):
    '''
    Function: full extraction pipeline behind the "Extract" button: deskew,
              crop and denoise the scan, run OCR, and report the first three
              recognised lines as first name, surname and date of death
    Input: extractimg: uploaded image (anything np.array() can turn into an
           image array)
    Output: tuple (OCR DataFrame, deskewed image, skew angle, skew
            confidence, denoised image, first name, first name confidence,
            surname, surname confidence, date of death, date of death
            confidence); confidences are strings such as "98.123%", and a
            field that OCR did not find is returned as two empty strings
    Raises: ValueError when no image was supplied
    '''
    if extractimg is None:
        raise ValueError("No image supplied: upload a scan before extracting.")
    extractimg = np.array(extractimg)

    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(extractimg, model)
    cleanimg = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(cleanimg, model)
    # model = onnxruntime.InferenceSession("./models/ResNet_od_v0.0.2.onnx")
    # boxes, confidence_boxes = detect_entries_ONNX(img, model)
    df = extract_detected_entries_pdl(img)

    def field_at(position):
        # text and formatted confidence of the OCR line at `position`;
        # empty strings when OCR returned fewer lines than expected
        if position >= len(df):
            return "", ""
        row = df.iloc[position]
        # .iloc: integer keys on a labelled Series are deprecated as positions
        confidence = round(float(row.iloc[1]) * 100, 3)
        return row.iloc[0], f"{confidence}%"

    # the fields are taken purely by row order of the OCR output
    firstname, firstnameconfidence = field_at(0)
    surname, surnameconfidence = field_at(1)
    dodname, dodconfidence = field_at(2)
    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)
# Stylesheet passed to gr.Blocks(css=...); the selectors correspond to the
# elem_classes values used by the layout components.
css = """
.run_container {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
}
.run_btn {
  margin: auto;
  width: 50%;
  display: flex;
}
.upload_cell {
  margin: auto;
  display: flex;
}
.results_container {
  display: flex;
  justify-content: space-evenly;
}
.results_cell {
}
"""
import gradio as gr | |
# Gradio UI: one image input, an "Extract" button, and the pipeline outputs.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# in particular, whether the accordions sit at page level or inside the
# right-hand column should be confirmed against the running app.
with gr.Blocks(css = css) as demo:
    gr.Markdown("""
# Death Certificate Extraction
""", elem_classes = "h1")
    gr.Markdown("Upload a PDF, extract data")
    with gr.Box(elem_classes = "run_container"):
        # ExtractInput = gr.File(label = "Death Certificate", elem_classes="upload_cell")
        # NOTE(review): gr.Button normally takes its caption through `value`;
        # confirm that `label` is honoured by the installed Gradio version.
        ExtractButton = gr.Button(label = "Extract", elem_classes="run_btn")
    with gr.Row(elem_id = "hide"):
        # left column: the uploaded scan
        with gr.Column():
            ExtractInput = gr.Image()
        # right column: the three extracted fields with their confidences
        with gr.Column():
            # ExtractResult = gr.Image(label = "result")
            with gr.Row(elem_classes = "results_container"):
                FirstNameBox = gr.Textbox(label = "First Name", elem_classes = "results_cell")
                FirstNameConfidenceBox = gr.Textbox(label = "First Name Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                SurnameNameBox = gr.Textbox(label = "Surname", elem_classes = "results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label = "Surname Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                DODBox = gr.Textbox(label = "Date of Death", elem_classes = "results_cell")
                DODConfidenceBox = gr.Textbox(label = "Date of Death Confidence", elem_classes = "results_cell")
    # collapsed sections with the intermediate results
    with gr.Accordion("Full Results", open = False):
        ExtractDF = gr.Dataframe(label = "Results")
    with gr.Accordion("Clean Image", open = False):
        CleanOutput = gr.Image()
    with gr.Accordion("Deskew", open = False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label = "Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label = "Confidence")
    # the outputs list must stay in the order of pdf_extract_gr's return tuple
    ExtractButton.click(fn=pdf_extract_gr,
                        inputs = ExtractInput,
                        outputs = [ExtractDF, DeskewOutput, DeskewAngle,
                                   DeskewConfidence, CleanOutput, FirstNameBox,
                                   FirstNameConfidenceBox, SurnameNameBox,
                                   SurnameNameConfidenceBox, DODBox, DODConfidenceBox])
demo.launch(show_api=True, share=False, debug=True)