import logging

import gradio as gr
import torch
from transformers import AutoImageProcessor, AutoTokenizer, AutoModel


# Hugging Face Hub repository that supplies the weights, the tokenizer and the
# image-processor configuration used below.
repo_id = "OpenGVLab/InternVL2-1B"

# Choose the device before loading so the weight dtype can depend on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: trust_remote_code=True runs Python code shipped inside the model
# repository; only use it with repositories you trust.
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
    # Half precision is only requested on GPU: float16 inference on CPU is
    # poorly supported by PyTorch (missing or very slow kernels), so the CPU
    # fallback loads the weights in float32 instead.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)

model.to(device)
# The script only performs inference; make that explicit.
model.eval()


def analyze_image(image, *, prompt="describe this image", max_new_tokens=256):
    """Generate a text description of an image with the module-level model.

    Args:
        image: PIL image handed over by the Gradio image input, or ``None``
            when the user submits the form without uploading anything.
        prompt: Instruction text sent to the model together with the image.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded model output, or a human-readable message when no image
        was supplied or when processing failed. Failures are returned as
        text instead of being raised so the text output can display them.
    """
    if image is None:
        return "Please upload an image first."

    try:
        rgb_image = image.convert("RGB")

        image_inputs = image_processor(images=rgb_image, return_tensors="pt")
        text_inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Cast the pixel tensor to the model's dtype so it matches the
        # weights, which may have been loaded in float16.
        pixel_values = image_inputs["pixel_values"].to(
            device=device, dtype=model.dtype
        )

        # NOTE(review): the InternVL2 model card documents
        # `model.chat(tokenizer, pixel_values, question, generation_config)`
        # as the supported entry point - confirm that calling `generate`
        # directly with a plain-text prompt works for this checkpoint.
        outputs = model.generate(
            input_ids=text_inputs["input_ids"],
            attention_mask=text_inputs["attention_mask"],
            pixel_values=pixel_values,
            # Without an explicit budget the library default length limit
            # applies, which can cut descriptions short.
            max_new_tokens=max_new_tokens,
        )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as exc:
        # UI boundary: keep the full traceback in the server log and show the
        # user a short message instead of crashing the request.
        logging.getLogger(__name__).exception("Image analysis failed")
        return f"An error occurred: {exc}"


# The uploaded picture is delivered to the callback as a PIL image.
_image_input = gr.Image(type="pil")

# Web UI: one image in, one text box out, backed by analyze_image.
demo = gr.Interface(
    fn=analyze_image,
    inputs=_image_input,
    outputs="text",
    title="Image Description using InternVL2-1B",
    description="Upload an image and get a description generated by the InternVL2-1B model.",
)


# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()