Upload app.py with huggingface_hub
app.py CHANGED
@@ -28,7 +28,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedToke
 from typing import List, Union
 
 # Constants
-CLIP_PATH = "
+CLIP_PATH = "google/siglip-so400m-patch14-384"
 VLM_PROMPT = "A descriptive caption for this image:\n"
 MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
 CHECKPOINT_PATH = Path("wpkklhc6")
@@ -49,8 +49,8 @@ class ImageAdapter(nn.Module):
 
 def load_models():
     print("Loading CLIP 📎")
-    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH
-    clip_model = AutoModel.from_pretrained(CLIP_PATH
+    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
+    clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model.eval().requires_grad_(False).to("cuda")
 
     print("Loading tokenizer 🪙")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
@@ -66,7 +66,6 @@ def load_models():
 
     return clip_processor, clip_model, tokenizer, text_model, image_adapter
 
-
 @torch.no_grad()
 def stream_chat(input_images: List[Image.Image], batch_size: int, pbar: tqdm, models: tuple) -> List[str]:
     clip_processor, clip_model, tokenizer, text_model, image_adapter = models
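For readers who want the changed loading code in isolation, below is a minimal, self-contained sketch of what the two updated lines in load_models() do, based only on what the diff shows: load the SigLIP processor, keep just the checkpoint's vision tower, put it in eval mode, freeze its weights, and move it to the GPU. The image-embedding usage at the end is an illustrative assumption, not code from this commit; the file name "example.jpg" and the output_hidden_states flag are placeholders.

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

CLIP_PATH = "google/siglip-so400m-patch14-384"

# The processor resizes and normalizes input images to what the vision tower expects.
clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)

# AutoModel loads the full SigLIP checkpoint (vision + text towers);
# .vision_model keeps only the image encoder. eval() disables dropout,
# requires_grad_(False) freezes the weights for inference-only use, and
# .to("cuda") moves the module to the GPU, mirroring the committed line.
clip_model = (
    AutoModel.from_pretrained(CLIP_PATH)
    .vision_model
    .eval()
    .requires_grad_(False)
    .to("cuda")
)

# Hypothetical usage: embed a single image.
image = Image.open("example.jpg")
pixel_values = clip_processor(images=image, return_tensors="pt").pixel_values.to("cuda")
with torch.no_grad():  # mirrors the @torch.no_grad() decorator on stream_chat
    vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
print(vision_outputs.last_hidden_state.shape)  # (1, num_patches, hidden_dim)

Extracting only the vision tower keeps memory down (the SigLIP text tower is never used for captioning), and freezing it avoids accidental gradient tracking when the features are fed to the downstream image adapter and language model.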