Upload app.py with huggingface_hub
app.py CHANGED
@@ -28,7 +28,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedToke
 from typing import List, Union
 
 # Constants
-CLIP_PATH = "
+CLIP_PATH = "google/siglip-so400m-patch14-384"
 VLM_PROMPT = "A descriptive caption for this image:\n"
 MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
 CHECKPOINT_PATH = Path("wpkklhc6")
@@ -49,8 +49,8 @@ class ImageAdapter(nn.Module):
 
 def load_models():
     print("Loading CLIP 📎")
-    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH
-    clip_model = AutoModel.from_pretrained(CLIP_PATH
+    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
+    clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model.eval().requires_grad_(False).to("cuda")
 
     print("Loading tokenizer 🪙")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
@@ -66,7 +66,6 @@ def load_models():
 
     return clip_processor, clip_model, tokenizer, text_model, image_adapter
 
-
 @torch.no_grad()
 def stream_chat(input_images: List[Image.Image], batch_size: int, pbar: tqdm, models: tuple) -> List[str]:
     clip_processor, clip_model, tokenizer, text_model, image_adapter = models
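For readers who want the changed loading code in isolation, below is a minimal, self-contained sketch of what the two updated lines in load_models() do, based only on what the diff shows: load the SigLIP processor, keep just the checkpoint's vision tower, put it in eval mode, freeze its weights, and move it to the GPU. The image-embedding usage at the end is an illustrative assumption, not code from this commit; the file name "example.jpg" and the output_hidden_states flag are placeholders.

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

CLIP_PATH = "google/siglip-so400m-patch14-384"

# The processor resizes and normalizes input images to what the vision tower expects.
clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)

# AutoModel loads the full SigLIP checkpoint (vision + text towers);
# .vision_model keeps only the image encoder. eval() disables dropout,
# requires_grad_(False) freezes the weights for inference-only use, and
# .to("cuda") moves the module to the GPU, mirroring the committed line.
clip_model = (
    AutoModel.from_pretrained(CLIP_PATH)
    .vision_model
    .eval()
    .requires_grad_(False)
    .to("cuda")
)

# Hypothetical usage: embed a single image.
image = Image.open("example.jpg")
pixel_values = clip_processor(images=image, return_tensors="pt").pixel_values.to("cuda")
with torch.no_grad():  # mirrors the @torch.no_grad() decorator on stream_chat
    vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
print(vision_outputs.last_hidden_state.shape)  # (1, num_patches, hidden_dim)

Extracting only the vision tower keeps memory down (the SigLIP text tower is never used for captioning), and freezing it avoids accidental gradient tracking when the features are fed to the downstream image adapter and language model.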