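"""Gradio demo: upload an image and get a short description from OpenGVLab/InternVL2-1B.

The model, tokenizer, and image processor are loaded directly from the Hugging Face Hub;
trust_remote_code=True is required because InternVL2 ships its own modeling code.
"""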
import gradio as gr
from transformers import AutoImageProcessor, AutoTokenizer, AutoModel
import torch

repo_id = "OpenGVLab/InternVL2-1B"

# Load the image processor, tokenizer, and model directly from the Hub
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
# Use half precision on GPU for efficiency; fall back to float32 on CPU,
# where float16 inference is slow and some ops are unsupported
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=dtype,
)

# Move the model to the appropriate device and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def analyze_image(image):
    try:
        if image is None:
            return "Please upload an image first."
        img = image.convert("RGB")
        text = "describe this image"

        # Process the image and cast pixel values to the model's dtype
        # (e.g. float16 on GPU) to avoid a dtype mismatch in the vision encoder
        image_inputs = image_processor(images=img, return_tensors="pt")
        pixel_values = image_inputs["pixel_values"].to(device=device, dtype=model.dtype)
        # Process the text prompt
        text_inputs = tokenizer(text, return_tensors="pt").to(device)

        # Combine the text and image inputs
        inputs = {
            "input_ids": text_inputs["input_ids"],
            "attention_mask": text_inputs["attention_mask"],
            "pixel_values": pixel_values,
        }

        # Generate a description; no gradients are needed for inference,
        # and max_new_tokens keeps the default length limit from truncating the output
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=128)

        # Decode the outputs
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        return f"An error occurred: {str(e)}"

demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Description using InternVL2-1B",
    description="Upload an image and get a description generated by the InternVL2-1B model."
)

if __name__ == "__main__":
    demo.launch()