Sharathhebbar24 committed on
Commit
d28ba37
1 Parent(s): 65bf04b

First HF Space app

Browse files
Files changed (12) hide show
  1. .env +10 -0
  2. .gitignore +3 -0
  3. Dockerfile +13 -0
  4. README.md +9 -10
  5. folder_creation.py +12 -0
  6. main.py +65 -0
  7. minio_services.py +36 -0
  8. pdf_to_img.py +26 -0
  9. requirements.txt +6 -0
  10. static/fastapi.png +0 -0
  11. static/minIO.png +0 -0
  12. table_extraction.py +65 -0
.env ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Directories
2
+ IMAGES_DIR = images/
3
+ OUTPUTS_DIR = outputs/
4
+ PDF_DIR = pdfs/
5
+
6
+ # Bucket
7
+ MINIO_KEY = QUK5tI3fsjStPYrCKs18eb3OPTFzPLGeVOLXrsMc
8
+ HOST = localhost:9000
9
+ BUCKET_NAME = table-detection
10
+ ACCESS_KEY = Table extraction
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ junks/
2
+ demo.py
3
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Debian-based Python image for the FastAPI table-detection service.
FROM python:3.9-slim-bullseye

WORKDIR /

# Copy the dependency list on its own first so the install layers below stay
# cached until requirements.txt actually changes.
COPY requirements.txt requirements.txt

RUN pip install --upgrade pip
RUN pip install -r requirements.txt

COPY . .

# EXPOSE needs a concrete value at build time; the original referenced $PORT
# without ever defining it. 8000 matches uvicorn's default port, and the value
# can still be overridden at run time (docker run -e PORT=...).
ENV PORT=8000
EXPOSE ${PORT}

# Bind to 0.0.0.0 so the server is reachable from outside the container
# (uvicorn defaults to 127.0.0.1). --reload is a development-only file watcher
# and is dropped here. The shell wrapper is needed because the exec form of
# CMD does not expand environment variables.
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT}"]
README.md CHANGED
@@ -1,10 +1,9 @@
1
- ---
2
- title: Table Detection
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Table Detection
2
+
3
+ - To run the FastAPI server: ```uvicorn main:app --reload```
4
+
5
+ ![FastAPI-Server](static/fastapi.png)
6
+
7
+ - To run the MinIO server: ```.\minio.exe server C:\minio --console-address :9090```
8
+
9
+ ![minIO-server](static/minIO.png)
 
folder_creation.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def remove_files(dirs):
    """Delete every entry inside the directory ``dirs``, keeping ``dirs`` itself.

    Regular files and symlinks are unlinked; subdirectories are removed
    recursively. The previous implementation called ``os.remove`` on every
    entry, which raises as soon as the directory contains a subdirectory.

    Args:
        dirs: Path of the directory to empty.

    Raises:
        OSError: If ``dirs`` cannot be listed or an entry cannot be deleted.
    """
    # Function-scope import: this module only imports ``os`` at file level.
    import shutil

    # Snapshot the listing first so nothing is deleted while still iterating.
    with os.scandir(dirs) as listing:
        entries = list(listing)

    for entry in entries:
        # A symlink to a directory is unlinked, not followed.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)
6
+
7
+
8
def make_directory_if_not_exists(dir_name):
    """Ensure ``dir_name`` exists as an empty directory.

    A missing directory is created, including any missing parents. An
    existing directory is kept and its contents are deleted via
    ``remove_files``.

    Args:
        dir_name: Path of the directory to create or empty.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
        OSError: If the directory cannot be created or emptied.
    """
    try:
        # Try the creation first instead of checking os.path.exists() beforehand:
        # that avoids a check-then-create race, and makedirs (unlike mkdir) also
        # creates missing parent directories.
        os.makedirs(dir_name)
    except FileExistsError:
        # A regular file with this name is an error, not a directory to empty.
        if not os.path.isdir(dir_name):
            raise
        remove_files(dir_name)
main.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path, PurePath
4
+
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi import FastAPI, Form, HTTPException
7
+ from dotenv import load_dotenv
8
+
9
+ from folder_creation import make_directory_if_not_exists
10
+ from pdf_to_img import pdf_to_image
11
+ from table_extraction import Table_extraction
12
+ from minio_services import MINIO
13
+
14
# Load KEY=VALUE pairs from a local .env file into the process environment.
load_dotenv()

# Working directories. The request handler appends the request uid directly to
# IMAGES_DIR and OUTPUTS_DIR.
IMAGES_DIR = os.getenv('IMAGES_DIR')
PDF_DIR = os.getenv('PDF_DIR')
OUTPUTS_DIR = os.getenv('OUTPUTS_DIR')

# MinIO connection settings, passed through to the MINIO helper class.
HOST = os.getenv('HOST')
ACCESS_KEY = os.getenv('ACCESS_KEY')
MINIO_KEY = os.getenv('MINIO_KEY')
BUCKET_NAME = os.getenv('BUCKET_NAME')

app = FastAPI()

origins = ["*"]

# Wildcard origins must not be combined with credentialed requests: with
# allow_credentials=True any website could call this API using the visitor's
# cookies/authorization. The endpoint in this file reads only form fields, so
# credentials are switched off while every origin stays allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
34
+
35
@app.post("/")
async def main(file_name: str = Form(None), uid: str = Form(None)):
    """Run table detection on a PDF or image and return the stored object name.

    The input is rendered (PDF) or copied (image) into a per-request folder
    under ``IMAGES_DIR``. The detector writes its annotated image into a
    per-request folder under ``OUTPUTS_DIR``. That image is uploaded to MinIO
    and fetched back, and the object name reported by MinIO is returned.

    Args:
        file_name: Path, on the server, of the PDF or image to process.
        uid: Request identifier; used as the per-request folder name and as
            the object prefix in the bucket.

    Returns:
        The object name returned by ``MINIO.download_from_minio``.

    Raises:
        HTTPException: 400 when a form field is missing or ``uid`` is not a
            plain name; 404 (status kept from the original handler) with the
            error text when any processing step fails.
    """
    # Both fields default to None, which previously surfaced as a TypeError
    # from the string concatenation below.
    if not file_name or not uid:
        raise HTTPException(
            status_code=400,
            detail="'file_name' and 'uid' form fields are required",
        )
    # uid becomes a folder name that is emptied and recursively deleted, so it
    # must be a single path component ("../x" would escape the work folders).
    if uid in (".", "..") or Path(uid).name != uid:
        raise HTTPException(
            status_code=400,
            detail="'uid' must be a plain name without path separators",
        )
    # NOTE(review): file_name is a client-supplied path that is read from the
    # server's file system as-is — confirm this endpoint is only reachable by
    # trusted callers.

    images_dir = None
    outputs_dir = None
    try:
        # Base folders are shared between requests: create them if missing but
        # never empty them, so another request's files are left alone.
        os.makedirs(IMAGES_DIR, exist_ok=True)
        images_dir = os.path.join(IMAGES_DIR, uid, "")
        make_directory_if_not_exists(images_dir)

        os.makedirs(OUTPUTS_DIR, exist_ok=True)
        outputs_dir = os.path.join(OUTPUTS_DIR, uid, "")
        make_directory_if_not_exists(outputs_dir)

        print(file_name)
        if PurePath(file_name).suffix.lower() == '.pdf':
            imagename = pdf_to_image(file_name, images_dir)
        else:
            imagename = images_dir + Path(file_name).name
            shutil.copy(file_name, imagename)

        model = Table_extraction(imagename, outputs_dir)
        op_img = model.get_results()

        minio = MINIO(HOST, ACCESS_KEY, MINIO_KEY, BUCKET_NAME, uid, op_img)
        minio.upload_to_minio()
        return minio.download_from_minio()

    except Exception as e:
        raise HTTPException(status_code=404, detail=str(e)) from e

    finally:
        # Remove only this request's folders. ignore_errors keeps a cleanup
        # failure (e.g. a folder that was never created) from replacing the
        # real error, which the previous except-handler cleanup could do.
        for scratch in (images_dir, outputs_dir):
            if scratch is not None:
                shutil.rmtree(scratch, ignore_errors=True)
minio_services.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from minio import Minio
3
+ from minio.error import (ResponseError, BucketAlreadyOwnedByYou,
4
+ BucketAlreadyExists)
5
class MINIO():
    """Upload one result file to a MinIO bucket and fetch it back.

    Objects are stored under the key ``<UID>/<file name of op>``.
    """

    def __init__(self, HOST, ACCESS_KEY, MINIO_KEY, BUCKET_NAME, UID, op):
        """Create the client and remember where the file goes.

        Args:
            HOST: MinIO endpoint passed to ``Minio``.
            ACCESS_KEY: Access key for the MinIO account.
            MINIO_KEY: Secret key for the MinIO account.
            BUCKET_NAME: Bucket that receives the object.
            UID: Request identifier used as the object key prefix.
            op: Local path of the file to upload.
        """
        self.minioClient = Minio(HOST,
                                 access_key=ACCESS_KEY,
                                 secret_key=MINIO_KEY,
                                 secure=False)
        self.BUCKET_NAME = BUCKET_NAME
        self.UID = UID
        self.op = op

    def _object_name(self):
        """Return the bucket key shared by upload and download."""
        return str(self.UID) + '/' + Path(self.op).name

    def upload_to_minio(self):
        """Create the bucket if needed and upload ``op`` into it.

        Raises:
            ResponseError: If bucket creation or the upload fails. The upload
                error used to be printed and swallowed, which made the
                following download fail with an unrelated message.
        """
        try:
            self.minioClient.make_bucket(self.BUCKET_NAME, location="us-east-1")
        except (BucketAlreadyOwnedByYou, BucketAlreadyExists):
            # The bucket is already there; nothing to create.
            pass

        self.minioClient.fput_object(self.BUCKET_NAME, self._object_name(), self.op)

    def download_from_minio(self):
        """Download the uploaded object and return its object name.

        The object is requested under the same ``<UID>/<name>`` key it was
        uploaded with; the previous code asked for ``<name>`` without the UID
        prefix, a key that was never written. The file is saved to the local
        relative path ``<UID>/<name>``, as before.

        Returns:
            The ``object_name`` reported by ``fget_object``.
        """
        object_name = self._object_name()
        val = self.minioClient.fget_object(self.BUCKET_NAME, object_name, object_name)
        return val.object_name
35
+
36
+
pdf_to_img.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pypdfium2 as pdfium
2
+
3
+
4
def pdf_to_image(pdf_name, IMAGES_DIR):
    """Render each page of a PDF to ``<IMAGES_DIR>/<page number>.png``.

    Pages are numbered from 1. The path of the last page written is returned.

    NOTE(review): the original indentation of ``return`` was not recoverable
    from the source this was rebuilt from; this assumes it followed the loop
    (all pages rendered, last path returned) — confirm.

    Args:
        pdf_name: Path of the PDF file to open.
        IMAGES_DIR: Directory that receives the PNG files.

    Returns:
        Path of the PNG written for the final page.

    Raises:
        ValueError: If the PDF has no pages (previously an UnboundLocalError
            on the return statement).
    """
    # Function-scope import: this module only imports pypdfium2 at file level.
    import os

    scale_value = 3  # same scale for every page, so set once outside the loop
    imagename = None

    pdf = pdfium.PdfDocument(pdf_name)
    try:
        for page_number in range(len(pdf)):
            page = pdf.get_page(page_number)
            try:
                pil_image = page.render_to(
                    pdfium.BitmapConv.pil_image,
                    scale=scale_value,
                    rotation=0,
                    fill_colour=(255, 255, 255, 255),
                    crop=(0, 0, 0, 0),
                    greyscale=False,
                    optimise_mode=pdfium.OptimiseMode.NONE,
                )

                # join() gives the same path as plain concatenation when
                # IMAGES_DIR ends in a separator, and a correct one when not.
                imagename = os.path.join(IMAGES_DIR, str(page_number + 1) + ".png")
                print(imagename)
                pil_image.save(imagename)
            finally:
                # Release the page even when rendering or saving fails.
                page.close()
    finally:
        pdf.close()

    if imagename is None:
        raise ValueError(f"PDF has no pages: {pdf_name}")
    return imagename
25
+
26
+ # pdf_to_image('junks\\Attention is all u need.pdf')
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ultralyticsplus==0.0.23
2
+ ultralytics==8.0.21
3
+ transformers
4
+ pypdfium2==3.15.0
5
+ ak-minio==5.0.7.post2
6
+ fastapi[all]
static/fastapi.png ADDED
static/minIO.png ADDED
table_extraction.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ultralyticsplus import YOLO, render_result
2
+ from pathlib import Path, PurePath
3
+
4
class Table_extraction():
    """Detect tables in one image and save an annotated copy of it."""

    # The detector is loaded once and shared by all instances. Constructing
    # YOLO(...) for every instance meant reloading the model on every request,
    # although its settings never change.
    _shared_model = None

    def __init__(self, image, OUTPUTS_DIR):
        """Remember the input image and where the annotated copy goes.

        Args:
            image: Path of the image to analyse.
            OUTPUTS_DIR: Directory that receives the annotated image.
        """
        self.model = self._get_model()
        self.image = image
        self.OUTPUTS_DIR = OUTPUTS_DIR

    @classmethod
    def _get_model(cls):
        """Return the shared detector, loading and configuring it on first use."""
        if cls._shared_model is None:
            model = YOLO('keremberke/yolov8m-table-extraction')
            # Prediction settings, identical for every request.
            model.overrides['conf'] = 0.25
            model.overrides['iou'] = 0.45
            model.overrides['agnostic_nms'] = False
            model.overrides['max_det'] = 1000
            cls._shared_model = model
        return cls._shared_model

    def get_results(self):
        """Run detection, save the rendered result, and return its path.

        The raw results are kept on ``self.results``. The annotated image is
        saved in ``OUTPUTS_DIR`` under the input image's file name.

        Returns:
            Path (str) of the saved annotated image.
        """
        self.results = self.model(self.image)
        render = render_result(model=self.model, image=self.image, result=self.results[0])
        # Path joining works whether or not OUTPUTS_DIR ends with a separator;
        # the result stays a str, like the original concatenation.
        op_img = str(Path(self.OUTPUTS_DIR) / Path(self.image).name)
        render.save(op_img)
        return op_img
22
+
23
+ # def recognize_coords(self):
24
+ # final_results = []
25
+ # result = str(self.results)
26
+ # print(result)
27
+ # result = result.split('(')
28
+ # print('Result', result)
29
+ # if '\n' not in result[1]:
30
+ # print('In if')
31
+ # coords = []
32
+ # result = result[1][2: -3].split(',')
33
+ # for i in result:
34
+ # coords.append(float(i))
35
+ # final_results.append(coords)
36
+ # return final_results
37
+ # else:
38
+ # result = result[1:][0]
39
+ # result = result.split('\n')
40
+ # j = 0
41
+ # print('Results: ', result)
42
+ # for i in result:
43
+ # coords = []
44
+ # i = i.strip()
45
+ # if j == 0:
46
+ # print(i)
47
+ # i = i[2:-2]
48
+ # print(i)
49
+ # elif j == len(result) - 1:
50
+ # i = i[1:-3]
51
+ # else:
52
+ # i = i[1:-2]
53
+ # j+=1
54
+ # print(i)
55
+ # i = i.split(',')
56
+ # print(i)
57
+ # for k in i:
58
+ # coords.append(float(k))
59
+ # final_results.append(coords)
60
+ # return final_results
61
+
62
+
63
+ # te = Table_extraction('junks\9.png', 'outputs/123')
64
+ # te.get_results()
65
+ # print(te.recognize_coords())