fix apex
- README.md +6 -8
- vision_tower_builder.py +8 -3

README.md
@@ -20,10 +20,9 @@ We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Mod
 </li>
 </ol>
 
-
 ## Quick Start
 
-###
+### Prepare Model and Environment
 See our [github page](https://github.com/KangarooGroup/Kangaroo)
 
 ### Multi-round Chat with 🤗 Transformers
@@ -31,21 +30,20 @@ See our [github page](https://github.com/KangarooGroup/Kangaroo)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-
-tokenizer = AutoTokenizer.from_pretrained(
+model_path = "/path/to/kangaroo"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
-
+    model_path,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 )
 model = model.to("cuda")
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
 
-# replace /path/to/video to your own path to the video
 video_path = "/path/to/video"
 
 # Round 1
-query = "
+query = "Give a brief description of the video."
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
@@ -57,7 +55,7 @@ out, history = model.chat(video_path=video_path,
 print('Assitant: \n', out)
 
 # Round 2
-query = "
+query = "What happend at the end of the video?"
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
vision_tower_builder.py
@@ -686,7 +686,7 @@ class CLIPVisionCfg:
     timm_proj_bias: bool = False  # enable bias final projection
     eva_model_name: str = "eva-clip-l-14-448"  # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
-    fusedLN: bool = False
+    fusedLN: bool = True
     xattn: bool = True
     postnorm: bool = False
     rope: bool = True
@@ -707,8 +707,13 @@ def build_vision_tower(precision: str = 'bf16'):
 
 
     if vision_cfg.fusedLN:
-        from apex.normalization import FusedLayerNorm
-        norm_layer = partial(FusedLayerNorm, eps=1e-6)
+        try:
+            from apex.normalization import FusedLayerNorm
+            norm_layer = partial(FusedLayerNorm, eps=1e-6)
+        except:
+            print("")
+            norm_layer = partial(LayerNorm, eps=1e-6)
+
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
 
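The `vision_tower_builder.py` change is the fix the commit title refers to: `fusedLN` is enabled again, and the apex import that used to run unguarded can no longer crash model loading on machines without apex, since a failed import now falls back to the module's plain `LayerNorm`. A minimal sketch of the same fallback pattern, using `torch.nn.LayerNorm` to stand in for whatever `LayerNorm` name that module already has in scope:

from functools import partial

import torch.nn as nn

# Prefer apex's fused CUDA kernel when apex is installed; otherwise use the
# plain PyTorch op. Both compute the same normalization; the fused kernel is
# simply faster.
try:
    from apex.normalization import FusedLayerNorm
    norm_layer = partial(FusedLayerNorm, eps=1e-6)
except ImportError:
    norm_layer = partial(nn.LayerNorm, eps=1e-6)

ln = norm_layer(1024)  # instantiates whichever implementation was selected

Catching `ImportError`, as in this sketch, is a little stricter than the patch's bare `except:`, which would also swallow unrelated errors raised while apex initializes.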