Spaces:

huggingchat
/

pdf-to-markdown

Running on Zero

Liam Dyer committed on May 22

Commit

b697ac0

•

1 Parent(s): 3bf066d

only OCR when necessary

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,6 +4,16 @@ from pypdf import PdfReader
 import ocrmypdf
 @spaces.GPU
 def convert(pdf_file):
     reader = PdfReader(pdf_file)
@@ -17,26 +27,24 @@ def convert(pdf_file):
         "title": reader.metadata.title,
     }
     # Check if there are any images
     image_count = 0
     for page in reader.pages:
         image_count += len(page.images)
-    # If there are images, perform OCR on the document
-    if image_count > 0:
         out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
         ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
-        pdf_file = out_pdf_file
-        reader = PdfReader(pdf_file)
-    # Extract text
-    full_text = ""
-    for idx, page in enumerate(reader.pages):
-        text = page.extract_text()
-        if len(text) > 0:
-            full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
-    return full_text.strip(), metadata
 gr.Interface(

 import ocrmypdf
+def extract_text_from_pdf(reader):
+    full_text = ""
+    for idx, page in enumerate(reader.pages):
+        text = page.extract_text()
+        if len(text) > 0:
+            full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
+    return full_text.strip()
 @spaces.GPU
 def convert(pdf_file):
     reader = PdfReader(pdf_file)
         "title": reader.metadata.title,
     }
+    # Extract text
+    full_text = extract_text_from_pdf(reader)
     # Check if there are any images
     image_count = 0
     for page in reader.pages:
         image_count += len(page.images)
+    # If there are images and not much content, perform OCR on the document
+    if image_count > 0 and len(full_text) < 1000:
         out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
         ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
+        # Re-extract text
+        reader = PdfReader(pdf_file)
+        full_text = extract_text_from_pdf(reader)
+    return full_text, metadata
 gr.Interface(