sayehs committed • Commit c1efd44 • Parent(s): 94a9749

WIP

Files changed:
- config.json +0 -4
- modeling_gptj.py +11 -165
- test.py +13 -0
config.json CHANGED
@@ -3,10 +3,6 @@
   "architectures": [
     "GPTJForCausalLM"
   ],
-  "auto_map": {
-    "AutoConfig": "configuration_gptj.GPTJConfig",
-    "AutoModelForCausalLM": "modeling_gptj.GPTJForCausalLM"
-  },
   "attn_pdrop": 0.0,
   "bos_token_id": 50256,
   "embd_pdrop": 0.0,
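The removed `auto_map` block is what pointed the `Auto*` loaders at this repo's custom `configuration_gptj.py` / `modeling_gptj.py` when `trust_remote_code=True` was passed. A minimal sketch of the effect (not part of the commit; the repo id and auth flag are taken from `test.py` below, and access to the repo is assumed):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Without "auto_map" in config.json, the Auto classes fall back to the stock
# transformers GPT-J implementation instead of this repo's modeling_gptj.py.
config = AutoConfig.from_pretrained("d-matrix/gpt-j-6b", use_auth_token=True)
print(getattr(config, "auto_map", None))  # expected to be None after this change

# trust_remote_code=True is only needed while the repo still advertises custom
# classes via "auto_map"; test.py below accordingly leaves it commented out.
model = AutoModelForCausalLM.from_pretrained("d-matrix/gpt-j-6b", use_auth_token=True)
```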
modeling_gptj.py CHANGED
@@ -38,7 +38,6 @@ from transformers.utils import (
     is_torch_fx_proxy,
     logging,
 )
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
 from .configuration_gptj import GPTJConfig
 
 
@@ -453,54 +452,6 @@ GPTJ_INPUTS_DOCSTRING = r"""
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 
-PARALLELIZE_DOCSTRING = r"""
-    This is an experimental feature and is a subject to change at a moment's notice. Uses a device map to distribute
-    attention modules of the model across several devices. If no device map is given, it will evenly distribute blocks
-    across all devices.
-
-    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
-            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
-            automatically mapped to the first device (for esoteric reasons). That means that the first device should
-            have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the
-            following number of attention modules:
-
-                - gpt-j-6B: 28
-
-    Example:
-
-    ```python
-    # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
-    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
-    device_map = {
-        0: [0, 1, 2, 3, 4, 5, 6],
-        1: [7, 8, 9, 10, 11, 12, 13],
-        2: [14, 15, 16, 17, 18, 19, 20],
-        3: [21, 22, 23, 24, 25, 26, 27],
-    }
-    model.parallelize(device_map)
-    ```
-"""
-
-DEPARALLELIZE_DOCSTRING = r"""
-    Moves the model to CPU from a model parallel state.
-
-    Example:
-
-    ```python
-    # On a 4 GPU machine with gpt-j-6B:
-    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
-    device_map = {
-        0: [0, 1, 2, 3, 4, 5, 6],
-        1: [7, 8, 9, 10, 11, 12, 13],
-        2: [14, 15, 16, 17, 18, 19, 20],
-        3: [21, 22, 23, 24, 25, 26, 27],
-    }
-    model.parallelize(device_map) # Splits the model across several devices
-    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
-    ```
-"""
-
 
 @add_start_docstrings(
     "The bare GPT-J Model transformer outputting raw hidden-states without any specific head on top.",
@@ -517,62 +468,11 @@ class GPTJModel(GPTJPreTrainedModel):
         self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
         self.gradient_checkpointing = False
 
         # Initialize weights and apply final processing
         self.post_init()
 
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        warnings.warn(
-            "`GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
-            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
-            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
-            " ...}",
-            FutureWarning,
-        )
-        # Check validity of device_map
-        self.device_map = (
-            get_device_map(len(self.h), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.h))
-        self.model_parallel = True
-        self.first_device = (
-            "cpu"
-            if "cpu" in self.device_map.keys()
-            else "cuda:" + str(min(self.device_map.keys()))
-        )
-        self.last_device = "cuda:" + str(max(self.device_map.keys()))
-        self.wte = self.wte.to(self.first_device)
-        # Load onto devices
-        for k, v in self.device_map.items():
-            for block in v:
-                cuda_device = "cuda:" + str(k)
-                self.h[block] = self.h[block].to(cuda_device)
-        # ln_f to last
-        self.ln_f = self.ln_f.to(self.last_device)
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        warnings.warn(
-            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
-            FutureWarning,
-        )
-        self.model_parallel = False
-        self.device_map = None
-        self.first_device = "cpu"
-        self.last_device = "cpu"
-        self.wte = self.wte.to("cpu")
-        for index in range(len(self.h)):
-            self.h[index] = self.h[index].to("cpu")
-        self.ln_f = self.ln_f.to("cpu")
-        torch.cuda.empty_cache()
-
     def get_input_embeddings(self):
         return self.wte
 
@@ -702,19 +602,16 @@ class GPTJModel(GPTJPreTrainedModel):
         all_self_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure layer_past is on same device as hidden_states (might not be correct)
-                if layer_past is not None:
-                    layer_past = tuple(
-                        past_state.to(hidden_states.device) for past_state in layer_past
-                    )
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if attention_mask is not None:
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if isinstance(head_mask, torch.Tensor):
-                    head_mask = head_mask.to(hidden_states.device)
+            # Ensure layer_past is on same device as hidden_states (might not be correct)
+            if layer_past is not None:
+                layer_past = tuple(
+                    past_state.to(hidden_states.device) for past_state in layer_past
+                )
+            # Ensure that attention_mask is always on the same device as hidden_states
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(hidden_states.device)
+            if isinstance(head_mask, torch.Tensor):
+                head_mask = head_mask.to(hidden_states.device)
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
@@ -749,12 +646,6 @@ class GPTJModel(GPTJPreTrainedModel):
                     outputs[2 if use_cache else 1],
                 )
 
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
         hidden_states = self.ln_f(hidden_states)
 
         hidden_states = hidden_states.view(output_shape)
@@ -796,44 +687,9 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
         self.transformer = GPTJModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
         # Initialize weights and apply final processing
         self.post_init()
 
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        warnings.warn(
-            "`GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
-            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
-            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
-            " 0, 'transformer.h.1': 1, ...}",
-            FutureWarning,
-        )
-        self.device_map = (
-            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.transformer.h))
-        self.transformer.parallelize(self.device_map)
-        self.lm_head = self.lm_head.to(self.transformer.first_device)
-        self.model_parallel = True
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        warnings.warn(
-            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
-            FutureWarning,
-        )
-        self.transformer.deparallelize()
-        self.transformer = self.transformer.to("cpu")
-        self.lm_head = self.lm_head.to("cpu")
-        self.model_parallel = False
-        torch.cuda.empty_cache()
-
     def get_output_embeddings(self):
         return self.lm_head
 
@@ -937,9 +793,7 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
         hidden_states = transformer_outputs[0]
 
         # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.transformer.first_device)
-            hidden_states = hidden_states.to(self.lm_head.weight.device)
+        hidden_states = hidden_states.to(self.lm_head.weight.device)
 
         # make sure sampling in fp16 works correctly and
         # compute loss in fp32 to match with mesh-tf version
@@ -1013,10 +867,6 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
         self.transformer = GPTJModel(config)
         self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1154,10 +1004,6 @@ class GPTJForQuestionAnswering(GPTJPreTrainedModel):
         self.transformer = GPTJModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
         # Initialize weights and apply final processing
         self.post_init()
 
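Taken together, the modeling_gptj.py changes drop the deprecated `parallelize()`/`deparallelize()` model-parallel path and keep only unconditional per-layer device moves in `forward`. The deprecation warnings in the deleted code already name the replacement: pass a `device_map` to `from_pretrained` and let accelerate place the modules, which is what `test.py` below does with `device_map="auto"`. A rough sketch of that replacement flow (not part of the commit), assuming the `d-matrix/gpt-j-6b` repo id from `test.py`, repo access, at least two visible GPUs, and `accelerate` installed:

```python
from transformers import AutoModelForCausalLM

# Automatic placement across the available devices (what test.py uses):
model = AutoModelForCausalLM.from_pretrained(
    "d-matrix/gpt-j-6b", device_map="auto", use_auth_token=True
)

# Or a hand-written module_name -> device dict, the format the removed
# deprecation warning describes ({'transformer.h.0': 0, 'transformer.h.1': 1, ...}).
# GPT-J 6B has 28 transformer blocks (per the deleted PARALLELIZE_DOCSTRING);
# this illustrative split puts the first half on GPU 0 and the rest on GPU 1.
manual_map = {"transformer.wte": 0, "transformer.ln_f": 1, "lm_head": 1}
manual_map.update({f"transformer.h.{i}": 0 if i < 14 else 1 for i in range(28)})
model = AutoModelForCausalLM.from_pretrained(
    "d-matrix/gpt-j-6b", device_map=manual_map, use_auth_token=True
)
```

With accelerate-style placement there is no `self.model_parallel` flag to key off, which is why the forward loop now moves `layer_past`, `attention_mask`, and `head_mask` to `hidden_states.device` unconditionally.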
test.py ADDED
@@ -0,0 +1,13 @@
+import evaluate
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM
+
+perplexity = evaluate.load("d-matrix/dmx_perplexity", module_type="metric")
+input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10]
+model = AutoModelForCausalLM.from_pretrained(
+    pretrained_model_name_or_path="d-matrix/gpt-j-6b",
+    # trust_remote_code=True,
+    device_map="auto",
+    use_auth_token=True,
+)
+results = perplexity.compute(model=model, references=input_texts)
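The new script scores the first ten WikiText-2 test lines with the `d-matrix/dmx_perplexity` evaluate module. As an independent sanity check (not part of the commit, and assuming the repo also ships the standard GPT-J tokenizer), perplexity can be computed directly from the model's causal-LM loss:

```python
import math

import torch
from transformers import AutoTokenizer

# Tokenizer repo is an assumption; EleutherAI/gpt-j-6B would be the usual fallback.
tokenizer = AutoTokenizer.from_pretrained("d-matrix/gpt-j-6b", use_auth_token=True)


@torch.no_grad()
def manual_perplexity(model, texts, max_length=1024):
    nll, n_tokens = 0.0, 0
    for text in texts:
        if not text.strip():
            continue  # WikiText contains many empty lines; skip them
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        input_ids = enc.input_ids.to(model.device)
        # With labels == input_ids, the model returns the mean next-token cross-entropy.
        loss = model(input_ids, labels=input_ids).loss
        n = input_ids.shape[1] - 1  # number of predicted tokens in this sequence
        nll += loss.item() * n
        n_tokens += n
    return math.exp(nll / n_tokens)


# model and input_texts come from test.py above.
print(manual_perplexity(model, input_texts))
```

The result will not match `dmx_perplexity` exactly unless that metric uses the same tokenization, truncation, and token-weighted averaging, so treat this only as a rough cross-check.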