These model weights are identical to laion/CLIP-ViT-H-14-laion2B-s32B-b79K, but contain only the ViT component. The goal is to support loading the model as a `CLIPModel`, since I failed to load the original checkpoint with `AutoModel` (feedback appreciated). With this distribution I am finally able to load the model via `AutoModel`, and can further run image classification tasks using my self-defined class `CLIPViTForImageClassification`, listed below. However, one small issue remains that I cannot resolve: loading works only after I `git clone` this repo locally; loading directly from the Hub still fails.

```python
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.modeling_outputs import ImageClassifierOutput
from transformers.models.clip.modeling_clip import (
    CLIPConfig,
    CLIPPreTrainedModel,
    CLIPVisionTransformer,
)


class CLIPViTForImageClassification(CLIPPreTrainedModel):
    def __init__(self, config: CLIPConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        vision_config = config.vision_config
        self.vision_model = CLIPVisionTransformer(vision_config)

        # Classifier head on top of the vision transformer
        self.classifier = (
            nn.Linear(vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        # head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        # interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in
            `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is
            computed (Mean-Squared loss); if `config.num_labels > 1` a classification loss is
            computed (Cross-Entropy).
        """
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.vision_model( pixel_values, #head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, #interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output[:, 0, :]) loss = None if labels is not None: # move labels to correct device to enable model parallelism labels = labels.to(logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return ImageClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ```