qtoino committed on
Commit
7974bc5
1 Parent(s): 9b65e39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -67
app.py CHANGED
@@ -1,75 +1,30 @@
1
  import gradio as gr
2
- import requests
3
- from PIL import Image
4
  from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
5
- import spaces
6
 
7
- @spaces.GPU
8
- def infer_infographics(image, question):
9
- model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-ai2d-base")
10
- processor = Pix2StructProcessor.from_pretrained("google/pix2struct-ai2d-base")
11
 
12
- inputs = processor(images=image, text=question, return_tensors="pt")
 
13
 
 
 
 
14
  predictions = model.generate(**inputs)
15
  return processor.decode(predictions[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- @spaces.GPU
18
- def infer_ui(image, question):
19
- model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-screen2words-base")
20
- processor = Pix2StructProcessor.from_pretrained("google/pix2struct-screen2words-base")
21
-
22
- inputs = processor(images=image,text=question, return_tensors="pt")
23
-
24
- predictions = model.generate(**inputs)
25
- return processor.decode(predictions[0], skip_special_tokens=True)
26
-
27
- @spaces.GPU
28
- def infer_chart(image, question):
29
- model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-chartqa-base")
30
- processor = Pix2StructProcessor.from_pretrained("google/pix2struct-chartqa-base")
31
-
32
- inputs = processor(images=image, text=question, return_tensors="pt")
33
-
34
- predictions = model.generate(**inputs)
35
- return processor.decode(predictions[0], skip_special_tokens=True)
36
-
37
- @spaces.GPU
38
- def infer_doc(image, question):
39
- model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-base")
40
- processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-base")
41
- inputs = processor(images=image, text=question, return_tensors="pt")
42
- predictions = model.generate(**inputs)
43
- return processor.decode(predictions[0], skip_special_tokens=True)
44
-
45
- css = """
46
- #mkd {
47
- height: 500px;
48
- overflow: auto;
49
- border: 1px solid #ccc;
50
- }
51
- """
52
-
53
- with gr.Blocks(css=css) as demo:
54
- gr.HTML("<h1><center>Pix2Struct 📄<center><h1>")
55
- gr.HTML("<h3><center>Pix2Struct is a powerful backbone for visual question answering. ⚡</h3>")
56
- gr.HTML("<h3><center>This app has base version of the model. For better performance, use large checkpoints.<h3>")
57
-
58
- with gr.Row():
59
- with gr.Column():
60
- input_img = gr.Image(label="Input Document")
61
- question = gr.Text(label="Question")
62
- submit_btn = gr.Button(value="Submit")
63
- output = gr.Text(label="Answer")
64
- gr.Examples(
65
- [["docvqa_example.png", "How many items are sold?"]],
66
- inputs = [input_img, question],
67
- outputs = [output],
68
- fn=infer_doc,
69
- cache_examples=True,
70
- label='Click on any Examples below to get Document Question Answering results quickly 👇'
71
- )
72
-
73
- submit_btn.click(infer_doc, [input_img, question], [output])
74
-
75
- demo.launch(debug=True)
 
1
  import gradio as gr
2
+ # from PIL import Image
 
3
  from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
 
4
 
 
 
 
 
5
 
6
# Checkpoint fine-tuned for document visual question answering (DocVQA).
# Named once so the model and processor are guaranteed to use the same weights.
CHECKPOINT = "google/pix2struct-docvqa-base"

# Load the model and processor a single time at import, so every request
# reuses them instead of reloading from disk on each call.
model = Pix2StructForConditionalGeneration.from_pretrained(CHECKPOINT)
processor = Pix2StructProcessor.from_pretrained(CHECKPOINT)
 
9
def process_document(image, question):
    """Answer a natural-language question about a document image.

    Args:
        image: Input document image as delivered by the Gradio "image"
            component (presumably a PIL image or numpy array — the
            processor accepts either; TODO confirm against the widget).
        question: Question text about the document's contents.

    Returns:
        The model's decoded answer string, with special tokens stripped.
    """
    # Tokenize/patch the image together with the question prompt.
    inputs = processor(images=image, text=question, return_tensors="pt")
    predictions = model.generate(**inputs)
    return processor.decode(predictions[0], skip_special_tokens=True)
14
# UI copy shown on the demo page.
description = "Demo for pix2struct fine-tuned on DocVQA (document visual question answering). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/pdf/2210.03347.pdf' target='_blank'>PIX2STRUCT: SCREENSHOT PARSING AS PRETRAINING FOR VISUAL LANGUAGE UNDERSTANDING</a></p>"

demo = gr.Interface(
    fn=process_document,
    inputs=["image", "text"],
    outputs="text",
    title="Demo: pix2struct for DocVQA",
    description=description,
    article=article,
    examples=[["example_1.png", "When is the coffee break?"], ["example_2.jpeg", "What's the population of Stoddard?"]],
    cache_examples=False)

# `enable_queue=True` was deprecated in Gradio 3.x and removed from the
# gr.Interface constructor in 4.x (it raises TypeError there); enable the
# request queue explicitly instead.
demo.queue()
demo.launch()
30