WEBing committed
Commit 9eeb018
1 parent: a407a33
Files changed (2)
  1. README.md +6 -8
  2. vision_tower_builder.py +8 -3
README.md CHANGED
```diff
@@ -20,10 +20,9 @@ We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Mod
   </li>
 </ol>
 
-
 ## Quick Start
 
-### Installation
+### Prepare Model and Environment
 See our [github page](https://github.com/KangarooGroup/Kangaroo)
 
 ### Multi-round Chat with 🤗 Transformers
@@ -31,21 +30,20 @@ See our [github page](https://github.com/KangarooGroup/Kangaroo)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# replace /path/to/kangaroo with your own path to the model
-tokenizer = AutoTokenizer.from_pretrained("/path/to/kangaroo")
+model_path = "/path/to/kangaroo"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
-    "/path/to/kangaroo",
+    model_path,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 )
 model = model.to("cuda")
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
 
-# replace /path/to/video with your own path to the video
 video_path = "/path/to/video"
 
 # Round 1
-query = "Please input your first question."
+query = "Give a brief description of the video."
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
@@ -57,7 +55,7 @@ out, history = model.chat(video_path=video_path,
 print('Assistant: \n', out)
 
 # Round 2
-query = "Please input your second question."
+query = "What happened at the end of the video?"
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
```
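Pieced together, the post-commit example reads as below. Both diff hunks cut off inside the `model.chat(...)` call, so the arguments marked `# assumed` are illustrative guesses (for instance, that the otherwise-unused `terminators` list is passed as `eos_token_id`, and that `history` threads round-1 context into round 2); they are not taken from the hunks above.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/path/to/kangaroo"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # loads the repo's custom chat() implementation
)
model = model.to("cuda")
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

video_path = "/path/to/video"

# Round 1
query = "Give a brief description of the video."
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          eos_token_id=terminators)   # assumed keyword
print('Assistant: \n', out)

# Round 2
query = "What happened at the end of the video?"
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          history=history,            # assumed keyword
                          eos_token_id=terminators)   # assumed keyword
print('Assistant: \n', out)
```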
vision_tower_builder.py CHANGED
```diff
@@ -686,7 +686,7 @@ class CLIPVisionCfg:
     timm_proj_bias: bool = False  # enable bias final projection
     eva_model_name: str = "eva-clip-l-14-448"  # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
-    fusedLN: bool = False
+    fusedLN: bool = True
     xattn: bool = True
     postnorm: bool = False
     rope: bool = True
@@ -707,8 +707,13 @@ def build_vision_tower(precision: str = 'bf16'):
 
 
     if vision_cfg.fusedLN:
-        from apex.normalization import FusedLayerNorm
-        norm_layer = partial(FusedLayerNorm, eps=1e-6)
+        try:
+            from apex.normalization import FusedLayerNorm
+            norm_layer = partial(FusedLayerNorm, eps=1e-6)
+        except ImportError:
+            print("apex FusedLayerNorm unavailable; falling back to torch LayerNorm")
+            norm_layer = partial(LayerNorm, eps=1e-6)
+
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
 
```
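The net effect of this file's change: `fusedLN` now defaults to `True`, but apex becomes an optional accelerator instead of a hard dependency. A minimal standalone sketch of the same fallback pattern, assuming only that `torch` is installed and that apex may be absent:

```python
from functools import partial

from torch.nn import LayerNorm

try:
    # apex ships a fused CUDA LayerNorm kernel; treat it as strictly optional.
    from apex.normalization import FusedLayerNorm
    norm_layer = partial(FusedLayerNorm, eps=1e-6)
except ImportError:
    print("apex FusedLayerNorm unavailable; falling back to torch.nn.LayerNorm")
    norm_layer = partial(LayerNorm, eps=1e-6)

# Either branch yields a factory with the same constructor interface:
norm = norm_layer(1024)  # 1024 is a placeholder hidden width
```

Catching `ImportError` specifically, rather than using a bare `except:`, keeps unrelated failures inside apex visible instead of silently downgrading the layer.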