carsonhxsu committed
Commit 2ce2d73
1 Parent(s): 1fb9cb6
.gitignore ADDED
@@ -0,0 +1,6 @@
+ dist/
+ *.egg-info/
+ __pycache__
+ build/
+ .vscode
+ .idea
README.md CHANGED
@@ -1,3 +1,79 @@
 ---
 license: mit
+ language: en
+ tags:
+ - LLM
+ - XVERSE-13B-Chat
 ---
+ ## Model Card for lyraXVERSE
+
+ ## Speed
+
+ * Throughput evaluated in tokens/s
+ * Tested on an A100 40G
+ * Measured in MEMOPT mode
+
+ ### XVERSE-13B-Chat
+
+ ## Docker Environment Recommendation
+
+ - For CUDA 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
+ - For CUDA 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```
+
+ ```bash
+ docker pull nvcr.io/nvidia/pytorch:23.02-py3
+ docker run --rm -it --gpus all -v ./:/lyraXVERSE nvcr.io/nvidia/pytorch:23.02-py3
+
+ pip install -r requirements.txt
+ python demo.py
+ ```
+
+ ## Uses
+
+ ```python
+ from lyra_xverse import lyraXVERSE
+
+ model_path = "./models/"
+ tokenizer_path = "./models/"
+ inference_dtype = 'fp16'
+ prompt = "讲个故事:"
+ memopt_mode = 1
+ max_output_length = 512
+ arch = "Ampere"  # Ampere or Volta
+ cuda_version = 12  # cuda version, we currently support 11 and 12
+
+ model = lyraXVERSE(model_path,
+                    tokenizer_path=tokenizer_path,
+                    dtype=inference_dtype,
+                    memopt_mode=memopt_mode,
+                    arch=arch,
+                    cuda_version=cuda_version)
+ ```
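+
+ Text is then generated with `model.generate`; the call below follows `demo.py` in this repo, and the sampling values shown are simply the demo defaults:
+
+ ```python
+ prompts = [prompt]
+ output_texts = model.generate(
+     prompts, output_length=max_output_length,
+     top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
+ print(output_texts)
+ ```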
+
+ ## Demo Outputs
+
+ ### XVERSE-13B-Chat
+ #### input
+
+ 讲个故事:
+
+ #### output
+
+ 有一天,一位年轻的画家来到了一个偏远的村庄。他以其超凡的绘画技巧,为村民画了一幅美丽的图画。图画里,村庄的周围是翠绿的森林,清澈的溪流在其中流淌,村民们正在劳作,孩子们在田野里嬉戏。村民们看着这幅画,都对这位画家赞不绝口。\n\n村庄的领袖看到了这幅画,他想:"这幅画将会让我们的村庄更加美丽,我们应该让村民们知道这幅画。"于是,他带着画家去村庄的各个角落,让每一个村民都看到了这幅画。\n\n画家看着村民们看画的眼神,他意识到了自己的价值。他意识到,他不仅仅是一个画家,他也是一个能让人们看见希望的人。他的画不仅仅是艺术品,它是连接人们与希望的一座桥梁。\n\n这个故事告诉我们,画家的价值不只是他们的绘画技巧,而是他们的画作带给人们的感动和希望。画家的价值并不在于他们的画有多么昂贵,有多么独特,而在于他们能用画作打开人们的心扉,让人们看见希望,看见生活的美好。
+
+ ## TODO
+
+ ## Citation
+ ```bibtex
+ @Misc{lyraXVERSE2023,
+   author       = {Kangjian Wu, Zhengtao Wang, Yibo Lu, Haoxiong Su, Bin Wu},
+   title        = {lyraXVERSE: Accelerating XVERSE-13B-Chat (fp16) to 3000+ tokens/s},
+   howpublished = {\url{https://huggingface.co/TMElyralab/lyraXVERSE}},
+   year         = {2023}
+ }
+ ```
+
+ ## Report Bugs
+ - Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraLLaMA/discussions
+ - Mark bug reports with `[bug]` in the title.
demo.py ADDED
@@ -0,0 +1,26 @@
+ from lyra_xverse import lyraXVERSE
+
+ model_path = "./models/"
+ tokenizer_path = "./models/"
+ inference_dtype = 'fp16'
+ prompt = "讲个故事:"
+
+ memopt_mode = 1
+ max_output_length = 512
+ arch = "Ampere"  # Ampere or Volta
+ cuda_version = 12  # cuda version, we currently support 11 and 12
+
+ model = lyraXVERSE(model_path,
+                    tokenizer_path=tokenizer_path,
+                    dtype=inference_dtype,
+                    memopt_mode=memopt_mode,
+                    arch=arch,
+                    cuda_version=cuda_version)
+
+ bs = 1
+ prompts = [prompt, ] * bs
+ output_texts = model.generate(
+     prompts, output_length=max_output_length,
+     top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
+
+ print(output_texts)
lyra_xverse/__init__.py ADDED
@@ -0,0 +1 @@
+ from .lyra_xverse import lyraXVERSE
lyra_xverse/config.py ADDED
@@ -0,0 +1,32 @@
+ import dataclasses
+ import pathlib
+ from typing import Optional
+
+ @dataclasses.dataclass
+ class LyraXVERSEParam:
+     num_heads: int = 40
+     size_per_head: int = 128
+     inter_size: int = 13824
+     num_layers: int = 40
+     vocab_size: int = 39424
+     start_id: Optional[int] = 1
+     end_id: Optional[int] = 2
+     tensor_para_size: int = 1
+     pipeline_para_size: int = 1
+     remove_padding: bool = True
+     shared_contexts_ratio: float = 1.0
+     layernorm_eps: float = 1e-6
+     weights_data_type: str = "fp16"
+     rotary_embedding: int = 128
+     use_gptj_residual: bool = False
+
+     def __post_init__(self):
+         if not 0.0 <= self.shared_contexts_ratio <= 1.0:
+             raise ValueError(
+                 f'Got an invalid value of shared_contexts_ratio '
+                 f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')
+
+     def asdict(self):
+         return dataclasses.asdict(self)
+
+ LYRA_XVERSE_PARAM = LyraXVERSEParam()
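These defaults serve as the fallback in `lyra_xverse/lyra_xverse.py` when no `models/config.ini` is found next to the weights; the shipped config file takes precedence otherwise. A minimal sketch (not part of this commit) of inspecting or overriding a field:

```python
# Sketch only: LYRA_XVERSE_PARAM holds the fallback defaults consumed by lyraXVERSE's loader.
import dataclasses

from lyra_xverse.config import LYRA_XVERSE_PARAM

# dataclasses.replace returns a copy with selected fields overridden,
# e.g. a hypothetical 2-way tensor-parallel configuration.
custom = dataclasses.replace(LYRA_XVERSE_PARAM, tensor_para_size=2)
print(custom.asdict())
```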
lyra_xverse/lyra_xverse.py ADDED
@@ -0,0 +1,191 @@
+ from __future__ import annotations
+
+ import configparser
+ import pathlib
+ import typing
+ import os
+
+ import torch
+ import transformers
+ from torch.nn.utils.rnn import pad_sequence
+
+ from .config import LYRA_XVERSE_PARAM
+ from .model import XVERSEModel
+
+
+ class lyraXVERSE:
+     def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=1, arch='Ampere', cuda_version=12) -> None:
+         self.model_path = model_path
+         self.tokenizer_path = tokenizer_path
+         self.dtype = dtype
+         self.memopt_mode = memopt_mode
+         self.arch = arch
+         self.cuda_version = cuda_version
+
+         self.model, self.tokenizer = self.load_model_and_tokenizer()
+         print("Got model and tokenizer")
+
+     def load_model_and_tokenizer(self):
+         if self.tokenizer_path is None:
+             tokenizer_path = self.model_path
+         else:
+             tokenizer_path = self.tokenizer_path
+
+         print(f'Loading tokenizer from {tokenizer_path}')
+         tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
+
+         checkpoint_path = pathlib.Path(self.model_path)
+         config_path = checkpoint_path / 'config.ini'
+
+         if config_path.exists():
+             # Read model params from config.
+             cfg = configparser.ConfigParser()
+             cfg.read(config_path)
+             model_name = 'llama'
+
+             inference_data_type = self.dtype
+             if inference_data_type is None:
+                 inference_data_type = cfg.get(model_name, "weight_data_type")
+             model_args = dict(
+                 head_num=cfg.getint(model_name, 'head_num'),
+                 size_per_head=cfg.getint(model_name, "size_per_head"),
+                 inter_size=cfg.getint(model_name, 'inter_size'),
+                 layer_num=cfg.getint(model_name, "num_layer"),
+                 rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
+                 layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
+                 vocab_size=cfg.getint(model_name, "vocab_size"),
+                 start_id=cfg.getint(model_name, "start_id"),
+                 end_id=cfg.getint(model_name, "end_id"),
+                 weights_data_type=cfg.get(model_name, "weight_data_type"),
+                 tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
+                 inference_data_type=inference_data_type)
+         else:
+             inference_data_type = self.dtype
+             if inference_data_type is None:
+                 inference_data_type = LYRA_XVERSE_PARAM.weights_data_type
+             model_args = dict(head_num=LYRA_XVERSE_PARAM.num_heads,
+                               size_per_head=LYRA_XVERSE_PARAM.size_per_head,
+                               inter_size=LYRA_XVERSE_PARAM.inter_size,
+                               layer_num=LYRA_XVERSE_PARAM.num_layers,
+                               rotary_embedding_dim=LYRA_XVERSE_PARAM.rotary_embedding,
+                               layernorm_eps=LYRA_XVERSE_PARAM.layernorm_eps,
+                               vocab_size=LYRA_XVERSE_PARAM.vocab_size,
+                               start_id=LYRA_XVERSE_PARAM.start_id or tokenizer.bos_token_id,
+                               end_id=LYRA_XVERSE_PARAM.end_id or tokenizer.eos_token_id,
+                               weights_data_type=LYRA_XVERSE_PARAM.weights_data_type,
+                               tensor_para_size=LYRA_XVERSE_PARAM.tensor_para_size,
+                               inference_data_type=inference_data_type)
+
+         # update common parameters
+
+         # Load the C++ model into Pytorch model.
+         sm = "sm80"
+
+         if self.arch == "Ampere":
+             sm = "sm80"
+         elif self.arch == "Volta":
+             sm = "sm70"
+         else:
+             raise Exception(f"unsupported arch: {self.arch}")
+
+         cu = 'cu11'
+         if self.cuda_version == 11:
+             cu = 'cu11'
+         elif self.cuda_version == 12:
+             cu = 'cu12'
+         else:
+             raise Exception(f"unsupported cuda version: {self.cuda_version}")
+
+         lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
+         model_args.update(dict(
+             lib_path=lib_path,
+             model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
+             max_seq_len=0,  # for position seq embedding
+             pipeline_para_size=LYRA_XVERSE_PARAM.pipeline_para_size,
+             use_gptj_residual=LYRA_XVERSE_PARAM.use_gptj_residual,
+             memopt_mode=self.memopt_mode
+         ))
+
+         print('[FT][INFO] Load our FT highly optimized XVERSE model')
+         for k, v in model_args.items():
+             print(f' - {k.ljust(25, ".")}: {v}')
+
+         # Check sanity and consistency between the model and tokenizer.
+         checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
+                      'tensor_para_size', 'weights_data_type']
+         if None in [model_args[k] for k in checklist]:
+             none_params = [p for p in checklist if model_args[p] is None]
+             print(f'[FT][WARNING] Found None parameters {none_params}. They must '
+                   f'be provided either by the config file or CLI arguments.')
+         if model_args['start_id'] != tokenizer.bos_token_id:
+             print('[FT][WARNING] Given start_id does not match the bos token '
+                   'id of the pretrained tokenizer.')
+         if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
+             print('[FT][WARNING] Given end_id matches neither the pad token '
+                   'id nor the eos token id of the pretrained tokenizer.')
+
+         print(f'Loading model from {self.model_path}')
+         model = XVERSEModel(**model_args)
+         return model, tokenizer
+
+     def generate(self, prompts: typing.List[str] | str,
+                  output_length: int = 512,
+                  beam_width: int = 1,
+                  top_k: typing.Optional[torch.IntTensor] = 1,
+                  top_p: typing.Optional[torch.FloatTensor] = 1.0,
+                  beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
+                  temperature: typing.Optional[torch.FloatTensor] = 1.0,
+                  len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
+                  repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
+                  presence_penalty: typing.Optional[torch.FloatTensor] = None,
+                  min_length: typing.Optional[torch.IntTensor] = None,
+                  bad_words_list: typing.Optional[torch.IntTensor] = None,
+                  do_sample: bool = False,
+                  return_output_length: bool = False,
+                  return_cum_log_probs: int = 0):
+         if isinstance(prompts, str):
+             prompts = [prompts, ]
+
+         inputs = prompts
+
+         batch_size = len(inputs)
+         ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
+         ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
+
+         # We must encode the raw prompt texts one by one in order to compute the length of each original text.
+         input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
+         input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
+         # Once we have the token length of each input, we can batch the input list into a tensor, padding on the right.
+         input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)
+
+         random_seed = None
+         if do_sample:
+             random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)
+
+         outputs = self.model(start_ids=input_token_ids,
+                              start_lengths=input_lengths,
+                              output_len=output_length,
+                              beam_width=beam_width,
+                              top_k=top_k * ones_int,
+                              top_p=top_p * ones_float,
+                              beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
+                              temperature=temperature * ones_float,
+                              len_penalty=len_penalty * ones_float,
+                              repetition_penalty=repetition_penalty * ones_float,
+                              random_seed=random_seed,
+                              return_output_length=return_output_length,
+                              return_cum_log_probs=return_cum_log_probs)
+
+         if return_cum_log_probs > 0:
+             outputs = outputs[0]  # output_token_ids.
+
+         # Slice the generated token ids of the 1st beam result.
+         # output = input tokens + generated tokens.
+         output_token_ids = [out[0, length:].cpu()
+                             for out, length in zip(outputs, input_lengths)]
+
+         output_texts = self.tokenizer.batch_decode(
+             output_token_ids, skip_special_tokens=True)
+
+         return output_texts
lyra_xverse/model.py ADDED
@@ -0,0 +1,166 @@
+ # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import print_function
+
+ import copy
+ import os
+ import pathlib
+ import typing
+
+ import numpy as np
+ import torch
+ import torch.distributed as dist
+ import torch.nn as nn
+
+ str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+
+ class XVERSEModel(nn.Module):
+     def __init__(self,
+                  head_num,
+                  size_per_head,
+                  inter_size,
+                  vocab_size,
+                  rotary_embedding_dim,
+                  start_id, end_id, layer_num,
+                  max_seq_len: int,
+                  layernorm_eps,
+                  tensor_para_size: int,
+                  pipeline_para_size: int,
+                  use_gptj_residual,
+                  lib_path: typing.Union[str, pathlib.Path],
+                  model_path,
+                  memopt_mode: int = 1,
+                  inference_data_type: str = "fp16",
+                  weights_data_type: typing.Union[str, np.dtype] = np.float32):
+         super().__init__()
+         self.head_num = head_num
+         self.size_per_head = size_per_head
+         self.inter_size = inter_size
+         self.vocab_size = vocab_size
+         self.rotary_embedding_dim = rotary_embedding_dim
+         self.start_id = start_id
+         self.end_id = end_id
+         self.max_seq_len = max_seq_len
+         self.layer_num = layer_num
+         self.use_gptj_residual = use_gptj_residual
+         self.layernorm_eps = layernorm_eps
+         self.memopt_mode = memopt_mode
+
+         # multi-gpu params
+         self.tensor_para_size = tensor_para_size
+         self.pipeline_para_size = pipeline_para_size
+         self.build_model = False
+         self.weights_data_type = weights_data_type
+         self.inference_data_type = inference_data_type
+
+         assert torch.cuda.is_available(), "CUDA is required for this model."
+
+         assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
+         assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
+
+         # Load the C++ model into Pytorch model.
+         torch.classes.load_library(os.path.abspath(lib_path))
+
+         # Prepare for tensor/pipeline parallel
+         try:
+             dist.init_process_group(backend='mpi')
+         except:
+             print("[INFO] WARNING: The process group has already been initialized")
+         self.rank = dist.get_rank()
+         self.device_count = torch.cuda.device_count()
+         self.device = self.rank % self.device_count
+         torch.cuda.set_device(self.device)
+
+         world_size = dist.get_world_size()
+         # print(tensor_para_size * pipeline_para_size)
+         assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."
+
+         self.tensor_para_rank = self.rank % self.tensor_para_size
+         self.pipeline_para_rank = self.rank // self.tensor_para_size
+
+         self.model = torch.classes.FasterTransformer.LlamaOp(
+             self.head_num, self.size_per_head, self.inter_size,
+             self.layer_num,
+             self.vocab_size,
+             self.rotary_embedding_dim,
+             self.layernorm_eps,
+             self.start_id, self.end_id,
+             self.tensor_para_size, self.pipeline_para_size,
+             self.max_seq_len,
+             self.use_gptj_residual,
+             self.memopt_mode,
+             model_path,
+             self.weights_data_type,
+             self.inference_data_type)
+
+         self.build_model = True
+         torch.cuda.empty_cache()
+
+     def forward(self,
+                 start_ids: torch.Tensor,
+                 start_lengths: torch.Tensor,
+                 output_len,
+                 beam_width=1,
+                 top_k: torch.Tensor = None,
+                 top_p: torch.Tensor = None,
+                 beam_search_diversity_rate: torch.Tensor = None,
+                 temperature: torch.Tensor = None,
+                 len_penalty: torch.Tensor = None,
+                 repetition_penalty: torch.Tensor = None,
+                 random_seed: torch.Tensor = None,
+                 return_output_length=False,
+                 return_cum_log_probs=0):
+
+         input_len = start_ids.size(1)
+         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
+
+         # Inputs to device
+         input_ids = start_ids.cuda(self.device)
+         input_lengths = start_lengths.cuda(self.device)
+         # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
+         outputs = self.model.forward(input_ids,
+                                      input_lengths,
+                                      output_len,
+                                      beam_width,  # optional, can be None
+                                      top_k,  # optional, can be None
+                                      top_p,  # optional, can be None
+                                      beam_search_diversity_rate,  # optional, can be None
+                                      temperature,  # optional, can be None
+                                      len_penalty,  # optional, can be None
+                                      repetition_penalty,  # optional, can be None
+                                      random_seed,  # optional, can be None
+                                      return_cum_log_probs)  # optional, can be None
+
+         if return_cum_log_probs == 0:
+             output_ids, output_lengths = outputs
+         else:
+             output_ids, output_lengths, output_cum_log_probs = outputs
+         if return_output_length:
+             if return_cum_log_probs > 0:
+                 return output_ids, output_lengths, output_cum_log_probs
+             else:
+                 return output_ids, output_lengths
+         else:
+             return output_ids
+
+     def set_input_tensor(self, input_tensor):
+         """Set input tensor to be used instead of forward()'s input.
+
+         When doing pipeline parallelism the input from the previous
+         stage comes from communication, not from the input, so the
+         model's forward_step_func won't have it. This function is thus
+         used by internal code to bypass the input provided by the
+         forward_step_func."""
+         self.input_tensor = input_tensor
models/config.ini ADDED
@@ -0,0 +1,14 @@
+ [llama]
+ model_name = XVERSE
+ head_num = 40
+ size_per_head = 128
+ inter_size = 13824
+ num_layer = 40
+ rotary_embedding = 128
+ layernorm_eps = 1e-06
+ vocab_size = 100278
+ start_id = 2
+ end_id = 3
+ tensor_para_size = 1
+ weight_data_type = fp32
+
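For reference, `lyra_xverse/lyra_xverse.py` reads this file with `configparser` under the `[llama]` section; a minimal sketch (not part of this commit) showing how the keys are consumed:

```python
# Sketch only: mirrors the reads performed in lyra_xverse/lyra_xverse.py (section name "llama").
import configparser

cfg = configparser.ConfigParser()
cfg.read("models/config.ini")

head_num = cfg.getint("llama", "head_num")            # 40
size_per_head = cfg.getint("llama", "size_per_head")  # 128
vocab_size = cfg.getint("llama", "vocab_size")        # 100278
weight_dtype = cfg.get("llama", "weight_data_type")   # "fp32"
```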
models/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
models/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ numpy
+ setuptools
+ torch
+ sentencepiece