Liam Dyer committed on
Commit
b697ac0
1 Parent(s): 3bf066d

only OCR when necessary

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -4,6 +4,16 @@ from pypdf import PdfReader
4
  import ocrmypdf
5
 
6
 
 
 
 
 
 
 
 
 
 
 
7
  @spaces.GPU
8
  def convert(pdf_file):
9
  reader = PdfReader(pdf_file)
@@ -17,26 +27,24 @@ def convert(pdf_file):
17
  "title": reader.metadata.title,
18
  }
19
 
 
 
 
20
  # Check if there are any images
21
  image_count = 0
22
  for page in reader.pages:
23
  image_count += len(page.images)
24
 
25
- # If there are images, perform OCR on the document
26
- if image_count > 0:
27
  out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
28
  ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
29
- pdf_file = out_pdf_file
30
- reader = PdfReader(pdf_file)
31
 
32
- # Extract text
33
- full_text = ""
34
- for idx, page in enumerate(reader.pages):
35
- text = page.extract_text()
36
- if len(text) > 0:
37
- full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
38
 
39
- return full_text.strip(), metadata
40
 
41
 
42
  gr.Interface(
 
4
  import ocrmypdf
5
 
6
 
7
+ def extract_text_from_pdf(reader):
8
+ full_text = ""
9
+ for idx, page in enumerate(reader.pages):
10
+ text = page.extract_text()
11
+ if len(text) > 0:
12
+ full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
13
+
14
+ return full_text.strip()
15
+
16
+
17
  @spaces.GPU
18
  def convert(pdf_file):
19
  reader = PdfReader(pdf_file)
 
27
  "title": reader.metadata.title,
28
  }
29
 
30
+ # Extract text
31
+ full_text = extract_text_from_pdf(reader)
32
+
33
  # Check if there are any images
34
  image_count = 0
35
  for page in reader.pages:
36
  image_count += len(page.images)
37
 
38
+ # If there are images and not much content, perform OCR on the document
39
+ if image_count > 0 and len(full_text) < 1000:
40
  out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
41
  ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
 
 
42
 
43
+ # Re-extract text
44
+ reader = PdfReader(pdf_file)
45
+ full_text = extract_text_from_pdf(reader)
 
 
 
46
 
47
+ return full_text, metadata
48
 
49
 
50
  gr.Interface(