The config class and config.json use DeepseekConfig, not DeepseekV2Config
#5 · opened by winglian
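With `trust_remote_code=True`, `config.json` resolves the configuration to `configuration_deepseek.DeepseekConfig`, but `modeling_deepseek.py` imported the name `DeepseekV2Config`, which `configuration_deepseek.py` does not define, so the model failed at import time. A minimal repro sketch (the repo id below is a placeholder, not a real checkpoint):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo = "your-org/deepseek-checkpoint"  # placeholder: any repo shipping these remote-code files

# config.json's auto_map points at configuration_deepseek.DeepseekConfig,
# so loading the config alone works and reports that class name.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # DeepseekConfig

# Before this patch, loading the model failed while importing modeling_deepseek.py:
#   ImportError: cannot import name 'DeepseekV2Config' from 'configuration_deepseek'
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
```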
modeling_deepseek.py CHANGED (+7 -7)

```diff
@@ -54,7 +54,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekV2Config
+from .configuration_deepseek import DeepseekConfig
 import torch.distributed as dist
 import numpy as np
 
@@ -681,7 +681,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -1190,7 +1190,7 @@ ATTENTION_CLASSES = {
 
 
 class DeepseekV2DecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekV2Config, layer_idx: int):
+    def __init__(self, config: DeepseekConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1287,7 +1287,7 @@ DeepseekV2_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekV2Config`]):
+        config ([`DeepseekConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1299,7 +1299,7 @@ DeepseekV2_START_DOCSTRING = r"""
     DeepseekV2_START_DOCSTRING,
 )
 class DeepseekV2PreTrainedModel(PreTrainedModel):
-    config_class = DeepseekV2Config
+    config_class = DeepseekConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["DeepseekV2DecoderLayer"]
@@ -1398,10 +1398,10 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekV2Config
+        config: DeepseekConfig
     """
 
-    def __init__(self, config: DeepseekV2Config):
+    def __init__(self, config: DeepseekConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
```
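A quick sanity check once the patch is applied; this sketch assumes the two files were copied into a local package directory (`deepseek_model/` is an arbitrary name, with an empty `__init__.py`) so the relative import inside `modeling_deepseek.py` resolves:

```python
from deepseek_model.configuration_deepseek import DeepseekConfig
from deepseek_model.modeling_deepseek import DeepseekV2PreTrainedModel

# The pretrained-model base class should now reference the config class
# that configuration_deepseek.py actually defines.
assert DeepseekV2PreTrainedModel.config_class is DeepseekConfig
print("config_class:", DeepseekV2PreTrainedModel.config_class.__name__)
```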