czczup committed
Commit 3e6175d
1 Parent(s): ef3de63

Upload folder using huggingface_hub

Files changed (1):
  1. README.md +112 -90
README.md CHANGED
@@ -72,10 +72,88 @@ Limitations: Although we have made efforts to ensure the safety of the model dur
 
 We provide example code to run InternVL-Chat-V1-5 using `transformers`.
 
- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.
 
 > Please use transformers==4.37.2 to ensure the model works normally.
 
 ```python
 import numpy as np
 import torch
@@ -88,7 +166,6 @@ from transformers import AutoModel, AutoTokenizer
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
-
 def build_transform(input_size):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     transform = T.Compose([
@@ -99,7 +176,6 @@ def build_transform(input_size):
     ])
     return transform
 
-
 def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float('inf')
     best_ratio = (1, 1)
@@ -115,8 +191,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
                 best_ratio = ratio
     return best_ratio
 
-
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height
 
@@ -154,8 +229,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
         processed_images.append(thumbnail_img)
     return processed_images
 
-
- def load_image(image_file, input_size=448, max_num=6):
     image = Image.open(image_file).convert('RGB')
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -163,106 +237,61 @@ def load_image(image_file, input_size=448, max_num=6):
     pixel_values = torch.stack(pixel_values)
     return pixel_values
 
-
- def split_model(model_name):
-     device_map = {}
-     world_size = torch.cuda.device_count()
-     num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48, 'InternVL-Chat-V1-5': 48,
-                   'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80,}[model_name]
-     # Since the first GPU will be used for ViT, treat it as half a GPU.
-     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-     num_layers_per_gpu = [num_layers_per_gpu] * world_size
-     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-     layer_cnt = 0
-     for i, num_layer in enumerate(num_layers_per_gpu):
-         for j in range(num_layer):
-             device_map[f'language_model.model.layers.{layer_cnt}'] = i
-             layer_cnt += 1
-     device_map['vision_model'] = 0
-     device_map['mlp1'] = 0
-     device_map['language_model.model.tok_embeddings'] = 0
-     device_map['language_model.model.embed_tokens'] = 0
-     device_map['language_model.output'] = 0
-     device_map['language_model.model.norm'] = 0
-     device_map['language_model.lm_head'] = 0
-     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-
-     return device_map
-
-
- path = 'OpenGVLab/InternVL-Chat-V1-5'
 # If you have an 80G A100 GPU, you can put the entire model on a single GPU.
 model = AutoModel.from_pretrained(
     path,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
- # Otherwise, you need to set device_map to use multiple GPUs for inference.
- # device_map = split_model('InternVL-Chat-V1-5')
- # print(device_map)
- # model = AutoModel.from_pretrained(
- #     path,
- #     torch_dtype=torch.bfloat16,
- #     low_cpu_mem_usage=True,
- #     trust_remote_code=True,
- #     device_map=device_map).eval()
-
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
- # set the max number of tiles in `max_num`
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
 
- generation_config = dict(
-     num_beams=1,
-     max_new_tokens=1024,
-     do_sample=False,
- )
 
 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 # single-image single-round conversation (单图单轮对话)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 # single-image multi-round conversation (单图多轮对话)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
@@ -270,19 +299,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 # batch inference, single image per sample (单图批处理)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
@@ -292,8 +319,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
-     print(f'User: {question}')
-     print(f'Assistant: {response}')
 
 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -328,29 +354,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
     pixel_values = torch.cat(pixel_values_list)
     return pixel_values, num_patches_list
 
-
 video_path = './examples/red-panda.mp4'
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 
 question = 'Describe this video in detail. Don\'t repeat.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 ```
 
- ### Streaming output
 
 Besides this method, you can also use the following code to get streamed output.
 
@@ -361,7 +381,7 @@ from threading import Thread
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
@@ -416,6 +436,8 @@ If `ImportError` occurs while executing this case, please install the required d
 
 When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
 from lmdeploy.vl import load_image
 
 We provide example code to run InternVL-Chat-V1-5 using `transformers`.
 
+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
 
 > Please use transformers==4.37.2 to ensure the model works normally.
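
As a quick sanity check, you can confirm the installed version at runtime (a minimal sketch; the pin above is the version this README recommends):

```python
import transformers

# Raise early if a different transformers version is installed.
assert transformers.__version__ == "4.37.2", f"Found transformers {transformers.__version__}, expected 4.37.2"
```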
 
+ ### Model Loading
+ 
+ #### 16-bit (bf16 / fp16)
+ 
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL-Chat-V1-5"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval().cuda()
+ ```
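
The heading covers both bf16 and fp16, but the snippet above only shows bf16. A minimal sketch of the fp16 variant (an assumption for GPUs without bfloat16 support) simply swaps the dtype:

```python
# Hypothetical fp16 variant: load in float16 instead of bfloat16.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
```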
+ 
+ #### BNB 8-bit Quantization
+ 
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL-Chat-V1-5"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     load_in_8bit=True,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval()
+ ```
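
To see how much memory the quantized weights occupy, you can query the model's footprint (a small sketch using `get_memory_footprint()` from Transformers; actual numbers depend on your setup):

```python
# Report the loaded model's parameter/buffer memory in GiB.
print(f'Memory footprint: {model.get_memory_footprint() / 1024**3:.1f} GiB')
```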
+ 
+ #### BNB 4-bit Quantization
+ 
+ > **⚠️ Warning:** Due to significant quantization errors with BNB 4-bit quantization on InternViT-6B, the model may produce nonsensical outputs and fail to understand images. Therefore, please avoid using BNB 4-bit quantization.
+ 
+ #### Multiple GPUs
+ 
+ The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
+ 
+ ```python
+ import math
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ 
+ def split_model(model_name):
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     num_layers = {'Mini-InternVL-2B-V1-5': 24, 'Mini-InternVL-4B-V1-5': 32, 'InternVL-Chat-V1-5': 48}[model_name]
+     # Since the first GPU will be used for ViT, treat it as half a GPU.
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for j in range(num_layer):
+             device_map[f'language_model.model.layers.{layer_cnt}'] = i
+             layer_cnt += 1
+     device_map['vision_model'] = 0
+     device_map['mlp1'] = 0
+     device_map['language_model.model.tok_embeddings'] = 0
+     device_map['language_model.model.embed_tokens'] = 0
+     device_map['language_model.output'] = 0
+     device_map['language_model.model.norm'] = 0
+     device_map['language_model.lm_head'] = 0
+     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+ 
+     return device_map
+ 
+ path = "OpenGVLab/InternVL-Chat-V1-5"
+ device_map = split_model('InternVL-Chat-V1-5')
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True,
+     device_map=device_map).eval()
+ ```
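
As a quick sanity check after loading with a custom `device_map`, you can inspect where each module was placed (a small sketch; `hf_device_map` is the attribute Transformers/Accelerate populate when a device map is used):

```python
# Print the resolved module-to-GPU placement.
for name, device in model.hf_device_map.items():
    print(f'{name} -> {device}')
```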
+ 
+ ### Inference with Transformers
+ 
 ```python
 import numpy as np
 import torch
 
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
 def build_transform(input_size):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     transform = T.Compose([
 
     ])
     return transform
 
 def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float('inf')
     best_ratio = (1, 1)
 
                 best_ratio = ratio
     return best_ratio
 
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height
 
         processed_images.append(thumbnail_img)
     return processed_images
 
+ def load_image(image_file, input_size=448, max_num=12):
     image = Image.open(image_file).convert('RGB')
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
 
     pixel_values = torch.stack(pixel_values)
     return pixel_values
 
 # If you have an 80G A100 GPU, you can put the entire model on a single GPU.
+ # Otherwise, you need to load the model across multiple GPUs; please refer to the `Multiple GPUs` section.
+ path = 'OpenGVLab/InternVL-Chat-V1-5'
 model = AutoModel.from_pretrained(
     path,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
+ # set the max number of tiles in `max_num`
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 # single-image single-round conversation (单图单轮对话)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
+ print(f'User: {question}\nAssistant: {response}')
 
 # single-image multi-round conversation (单图多轮对话)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 # batch inference, single image per sample (单图批处理)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
+     print(f'User: {question}\nAssistant: {response}')
 
 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
     pixel_values = torch.cat(pixel_values_list)
     return pixel_values, num_patches_list
 
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 
 question = 'Describe this video in detail. Don\'t repeat.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 ```
 
+ #### Streaming output
 
 Besides this method, you can also use the following code to get streamed output.
 
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
 
 When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
 
+ > Warning: Due to the scarcity of multi-image conversation data, the performance on multi-image tasks may be unstable, and it may require multiple attempts to achieve satisfactory results.
+ 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
 from lmdeploy.vl import load_image