Upload folder using huggingface_hub
- configuration_internvl_chat.py +1 -5
- modeling_internvl_chat.py +42 -47
configuration_internvl_chat.py
CHANGED
@@ -27,11 +27,10 @@ class InternVLChatConfig(PretrainedConfig):
         use_backbone_lora=0,
         use_llm_lora=0,
         pad2square=False,
-        select_layer=-
+        select_layer=-1,
         force_image_size=None,
         downsample_ratio=0.5,
         template=None,
-        image_fold=False,
         dynamic_image_size=False,
         use_thumbnail=False,
         ps_version='v1',
@@ -62,7 +61,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
         self.template = template
-        self.image_fold = image_fold
         self.dynamic_image_size = dynamic_image_size
         self.use_thumbnail = use_thumbnail
         self.ps_version = ps_version  # pixel shuffle version
@@ -70,7 +68,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.max_dynamic_patch = max_dynamic_patch

         logger.info(f'vision_select_layer: {self.select_layer}')
-        logger.info(f'image_fold: {self.image_fold}')
         logger.info(f'ps_version: {self.ps_version}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
@@ -93,7 +90,6 @@ class InternVLChatConfig(PretrainedConfig):
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio
         output['template'] = self.template
-        output['image_fold'] = self.image_fold
         output['dynamic_image_size'] = self.dynamic_image_size
         output['use_thumbnail'] = self.use_thumbnail
         output['ps_version'] = self.ps_version
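For context, a minimal sketch of loading the updated configuration after this change. The checkpoint path below is a placeholder, and loading via AutoConfig with trust_remote_code is the usual pattern for custom configs like this one, not something defined by this commit.

# Hypothetical sketch (not part of this commit): loading the config after
# image_fold was removed and select_layer now defaults to -1.
from transformers import AutoConfig

path = 'path/to/internvl_chat_checkpoint'  # placeholder path, assumption
config = AutoConfig.from_pretrained(path, trust_remote_code=True)

print(config.select_layer)        # -1 under the new default
print(config.dynamic_image_size)  # unchanged field, still present
# config.image_fold is no longer set by the config class after this change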
modeling_internvl_chat.py
CHANGED
@@ -23,40 +23,6 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)
 
 
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, C, H, W)
-        window_size (int): window size, assuming square window
-
-    Returns:
-        windows: (num_windows*B, C, window_size, window_size)
-    """
-    B, C, H, W = x.shape
-    assert H % window_size == 0 and W % window_size == 0, 'H and W must be divisible by window_size'
-
-    x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
-    windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size, window_size)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H * W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H * W, -1)
-    return x
-
-
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -72,7 +38,6 @@ class InternVLChatModel(PreTrainedModel):
         self.template = config.template
         self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
         self.downsample_ratio = config.downsample_ratio
-        self.image_fold = config.image_fold
         self.ps_version = config.ps_version

         logger.info(f'num_image_token: {self.num_image_token}')
@@ -242,10 +207,6 @@ class InternVLChatModel(PreTrainedModel):
         return vit_embeds + noise

     def extract_feature(self, pixel_values):
-        if self.image_fold:
-            image_size = pixel_values.size(-1)  # B, C, H, W
-            pixel_values = window_partition(pixel_values, window_size=image_size // self.image_fold)  # 4B, C, H/2, W/2
-
         if self.select_layer == -1:
             vit_embeds = self.vision_model(
                 pixel_values=pixel_values,
@@ -261,21 +222,55 @@ class InternVLChatModel(PreTrainedModel):
         if self.training and self.neftune_alpha is not None:
             vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)

-        if self.image_fold:
-            vit_embeds = window_reverse(vit_embeds, window_size=image_size // (self.image_fold * self.patch_size),
-                                        H=image_size // self.patch_size, W=image_size // self.patch_size)
-
-        # if torch.distributed.get_rank() == 0:
-        #     print("before pixel shuffle:", vit_embeds.shape)
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
         vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
-        # if torch.distributed.get_rank() == 0:
-        #     print("after pixel shuffle:", vit_embeds.shape)
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
+    def batch_chat(self, tokenizer, pixel_values, image_counts, questions, generation_config, history=None,
+                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
+        if history is not None or return_history:
+            print("Now multi-turn chat is not supported in batch_chat.")
+            raise NotImplementedError
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+        if tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
+            eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')  # 92542, InternLM2
+        else:
+            eos_token_id = tokenizer.eos_token_id
+
+        from .conversation import get_conv_template
+
+        queries = []
+        image_bs = pixel_values.shape[0]
+        print(f'dynamic ViT batch size: {image_bs}, image_counts: {image_counts}')
+        for idx, image_count in enumerate(image_counts):
+            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * image_count + IMG_END_TOKEN
+            question = image_token + '\n' + questions[idx]
+            template = get_conv_template(self.template)
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+            queries.append(query)
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].cuda()
+        attention_mask = model_inputs['attention_mask'].cuda()
+        generation_config['eos_token_id'] = eos_token_id
+
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split('<|im_end|>')[0].strip() for response in responses]  # for InternLM2
+        return responses
+
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
              IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
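A rough usage sketch for the batch_chat method added above. The model path, the 448x448 tile size, and the generation settings are illustrative assumptions; real pixel_values would come from the repository's image preprocessing rather than random tensors.

# Hypothetical sketch (not part of this commit): batched single-turn inference
# via the new batch_chat API. Dummy tensors stand in for preprocessed image tiles.
import torch
from transformers import AutoModel, AutoTokenizer

path = 'path/to/internvl_chat_checkpoint'  # placeholder path, assumption
model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

# All tiles for the whole batch are stacked along dim 0; image_counts[i] tells
# batch_chat how many of those tiles belong to questions[i].
tiles_q1 = torch.rand(2, 3, 448, 448, dtype=torch.bfloat16).cuda()  # tile size is an assumption
tiles_q2 = torch.rand(1, 3, 448, 448, dtype=torch.bfloat16).cuda()
pixel_values = torch.cat([tiles_q1, tiles_q2], dim=0)
image_counts = [2, 1]
questions = ['Describe the first image.', 'Describe the second image.']

generation_config = dict(max_new_tokens=512, do_sample=False)
responses = model.batch_chat(tokenizer, pixel_values, image_counts, questions, generation_config)
for q, r in zip(questions, responses):
    print(q, '->', r)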