Spaces:
Sleeping
Sleeping
Create pdf-extractor/pdf_extractor.py
Browse files
pdf-extractor/pdf_extractor.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union, Optional
|
2 |
+
import json
|
3 |
+
from indexify_extractor_sdk import Content, Extractor, Feature
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
from .utils.tt_module import get_tables
|
6 |
+
import fitz
|
7 |
+
import tempfile
|
8 |
+
|
9 |
+
class PDFExtractorConfig(BaseModel):
    # Parameters accepted by PDFExtractor.extract.
    #
    # output_types lists the kinds of content to emit; the extractor checks
    # for the strings "text", "image" and "table". A factory is used so each
    # config instance receives its own fresh list containing all three.
    output_types: List[str] = Field(
        default_factory=lambda: ["text", "image", "table"],
    )
|
11 |
+
|
12 |
+
class PDFExtractor(Extractor):
    """Extract per-page text, embedded images and tables from a PDF.

    The kinds of output produced are selected by
    ``PDFExtractorConfig.output_types``.
    """

    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Split one PDF into text, image and table ``Content`` items.

        Args:
            content: Input whose ``data`` attribute holds the raw PDF bytes.
            params: Configuration; ``params.output_types`` decides which of
                ``"text"``, ``"image"`` and ``"table"`` are emitted.

        Returns:
            A list of ``Content`` objects, each carrying one metadata feature
            with its ``type`` and 1-based ``page`` number. Text and image
            items come first (page by page), followed by the table items.
        """
        extracted = []
        wanted = params.output_types
        pdf_bytes = content.data

        # Open the document straight from memory. The previous implementation
        # wrote the bytes to a NamedTemporaryFile(delete=False) that was never
        # removed and never closed the document; the context manager closes
        # the document and no file is left behind on disk.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page_number, page in enumerate(doc, start=1):
                if "text" in wanted:
                    feature = Feature.metadata(value={"type": "text", "page": page_number})
                    extracted.append(Content.from_text(page.get_text(), features=[feature]))

                if "image" in wanted:
                    for image_info in page.get_images():
                        # The first entry of each image record is its xref.
                        xref = image_info[0]
                        pix = fitz.Pixmap(doc, xref)
                        # Anything that is not already gray or RGB (e.g. CMYK)
                        # is converted to RGB before encoding. PyMuPDF documents
                        # that colorspace may be None (image/stencil masks), so
                        # guard the attribute access instead of crashing on it.
                        colorspace = pix.colorspace
                        if colorspace is not None and colorspace.name not in (
                            fitz.csGRAY.name,
                            fitz.csRGB.name,
                        ):
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        feature = Feature.metadata(value={"type": "image", "page": page_number})
                        # tobytes() is called without arguments; PyMuPDF's
                        # default output format is PNG, matching content_type.
                        extracted.append(
                            Content(content_type="image/png", data=pix.tobytes(), features=[feature])
                        )

        if "table" in wanted:
            # get_tables receives the whole document and its result is iterated
            # as a mapping keyed by page, so it runs once per document rather
            # than once per page.
            # NOTE(review): the shape of each table value is defined by
            # utils.tt_module.get_tables; it only needs to be JSON-serialisable.
            tables = get_tables(pdf_bytes)
            # Distinct loop names: the original loop rebound `page` and the
            # `content` parameter itself.
            for table_page, table in tables.items():
                feature = Feature.metadata(value={"type": "table", "page": int(table_page)})
                extracted.append(
                    Content(content_type="application/json", data=json.dumps(table), features=[feature])
                )

        return extracted
|