import gradio as gr
from transformers import AutoImageProcessor, AutoTokenizer, AutoModel
import torch
repo_id = "OpenGVLab/InternVL2-1B"
# Load the image processor, tokenizer, and model directly from the Hub
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # Use half-precision for efficiency
)
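# NOTE: trust_remote_code=True downloads and executes the repository's custom
# modeling code from the Hub, so it should only be used with trusted repos.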
# Move the model to the GPU if available and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
def analyze_image(image):
    try:
        img = image.convert("RGB")
        text = "describe this image"
        # Process the image and cast pixel values to the model's
        # half-precision dtype so the two match during the forward pass
        image_inputs = image_processor(images=img, return_tensors="pt").to(device)
        pixel_values = image_inputs["pixel_values"].to(model.dtype)
        # Process the text
        text_inputs = tokenizer(text, return_tensors="pt").to(device)
        # Combine the text and image inputs
        inputs = {
            "input_ids": text_inputs["input_ids"],
            "attention_mask": text_inputs["attention_mask"],
            "pixel_values": pixel_values,
        }
        # Generate a description without tracking gradients;
        # max_new_tokens caps the response length (adjust as needed)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=128)
        # Decode the generated token IDs back into text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        return f"An error occurred: {e}"
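# NOTE: InternVL2's remote code also exposes a chat-style helper, which the
# model card documents as the primary inference path. A minimal sketch,
# assuming pixel_values is already preprocessed to the model's expected
# resolution and dtype:
#
#   generation_config = dict(max_new_tokens=128)
#   response = model.chat(tokenizer, pixel_values,
#                         "<image>\ndescribe this image", generation_config)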
demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Description using InternVL2-1B",
    description="Upload an image and get a description generated by the InternVL2-1B model.",
)
if __name__ == "__main__":
    demo.launch()
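# To try the app locally (assuming gradio, transformers, and torch are
# installed), run `python app.py`; Gradio serves the interface at
# http://127.0.0.1:7860 by default.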