rishiraj committed on
Commit
b288805
1 Parent(s): 5f95db3

Create pdf/pdf_extractor.py

Browse files
Files changed (1) hide show
  1. pdf/pdf_extractor.py +52 -0
pdf/pdf_extractor.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Optional
2
+ import json
3
+ from indexify_extractor_sdk import Content, Extractor, Feature
4
+ from pydantic import BaseModel, Field
5
+ from .utils.tt_module import get_tables
6
+ import fitz
7
+ import tempfile
8
+
9
class PDFExtractorConfig(BaseModel):
    """Options controlling which kinds of output the PDF extractor emits."""

    # Kinds of content to produce. The extractor checks for the literal
    # strings "text", "image" and "table"; all three are enabled by default.
    # A factory is used so every config instance gets its own list.
    output_types: List[str] = Field(
        default_factory=lambda: ["text", "image", "table"],
    )
11
+
12
class PDFExtractor(Extractor):
    """Extractor that pulls page text, embedded images and tables out of a PDF.

    Which of the three output kinds are produced is selected through
    ``PDFExtractorConfig.output_types``.
    """

    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Extract the requested content kinds from a PDF document.

        Args:
            content: Input content whose ``data`` holds the raw PDF bytes.
            params: Configuration; ``params.output_types`` selects any of
                ``"text"``, ``"image"`` and ``"table"``.

        Returns:
            A list of ``Content`` objects, each carrying one metadata feature
            with its ``type`` and 1-based ``page`` number. Text and image
            items come first in page order, followed by all table items.
        """
        contents = []
        want_text = "text" in params.output_types
        want_images = "image" in params.output_types

        # Open the PDF straight from memory: no temporary file is left behind
        # on disk, and the context manager closes the document even if
        # extraction raises part-way through.
        with fitz.open(stream=content.data, filetype="pdf") as doc:
            for page_number, page in enumerate(doc, start=1):
                if want_text:
                    feature = Feature.metadata(value={"type": "text", "page": page_number})
                    contents.append(Content.from_text(page.get_text(), features=[feature]))

                if want_images:
                    for img in page.get_images():
                        xref = img[0]  # first entry of an image record is its xref
                        pix = fitz.Pixmap(doc, xref)
                        colorspace = pix.colorspace
                        # Anything that is neither gray nor RGB is converted
                        # to RGB before encoding.
                        # NOTE(review): a pixmap may have no colorspace at all
                        # (e.g. image masks); it is passed through unconverted
                        # instead of crashing on ``None.name`` -- confirm the
                        # encoded output is acceptable for such images.
                        if colorspace is not None and colorspace.name not in (
                            fitz.csGRAY.name,
                            fitz.csRGB.name,
                        ):
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        feature = Feature.metadata({"type": "image", "page": page_number})
                        contents.append(
                            Content(content_type="image/png", data=pix.tobytes(), features=[feature])
                        )

        if "table" in params.output_types:
            # Table detection works on the whole document at once. Distinct
            # loop names keep the ``content`` parameter from being shadowed.
            tables = get_tables(content.data)
            for table_page, table in tables.items():
                feature = Feature.metadata({"type": "table", "page": int(table_page)})
                contents.append(
                    Content(content_type="application/json", data=json.dumps(table), features=[feature])
                )

        return contents