"""Gradio demo: visualise which image regions CLIP associates with a text prompt."""

import gradio as gr
import clip
import torch

import utils

# Name of the CLIP checkpoint handed to ``clip.load``.
clip_model = "RN50x4"
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = clip.load(clip_model, device=device, jit=False)
model.eval()


def grad_cam_fn(text, img, saliency_layer):
    """Compute a saliency visualisation of ``img`` with respect to ``text``.

    Args:
        text: Prompt string; it is tokenized with ``clip.tokenize`` and
            encoded by the model's text encoder.
        img: Image object exposing ``.resize`` (the Gradio input below is
            configured with ``type='pil'``).
        saliency_layer: Name of the attribute on ``model.visual`` whose value
            is passed to ``utils.gradCAM`` as the layer argument.

    Returns:
        The value produced by ``utils.getAttMap`` for the resized image and
        the computed map (presumably an overlay image -- verify against
        ``utils``).
    """
    # Bring the image to the square resolution the visual encoder reports.
    side = model.visual.input_resolution
    img = img.resize((side, side))

    # Encode the prompt; the features are cast to float32 before use.
    tokens = clip.tokenize([text]).to(device)
    text_feature = model.encode_text(tokens).float()

    # Preprocess the image and add a leading batch dimension.
    image_batch = preprocess(img).unsqueeze(0).to(device)

    target_layer = getattr(model.visual, saliency_layer)
    saliency = utils.gradCAM(model.visual, image_batch, text_feature, target_layer)

    # Move the result off the autograd graph/device and into NumPy.
    saliency = saliency.squeeze().detach().cpu().numpy()
    return utils.getAttMap(img, saliency)


# UI components, created in the same order they are wired into the interface.
prompt_box = gr.inputs.Textbox(label="Target Text", lines=1)
image_box = gr.inputs.Image(
    label="Input Image",
    image_mode="RGB",
    type="pil",
    shape=(512, 512),
)
layer_picker = gr.inputs.Dropdown(
    ["layer4", "layer3", "layer2", "layer1"],
    default="layer4",
    label="Saliency Layer",
)
attention_view = gr.outputs.Image(type="pil", label="Attention Map")

interface = gr.Interface(
    fn=grad_cam_fn,
    inputs=[prompt_box, image_box, layer_picker],
    outputs=attention_view,
    examples=[
        ["a cat lying on the floor", "assets/cat_dog.jpg", "layer4"],
        ["a dog sitting", "assets/cat_dog.jpg", "layer4"],
    ],
    description="OpenAI CLIP Grad CAM",
)

interface.launch()