carsonhxsu committed
Commit 2ce2d73 • 1 Parent(s): 1fb9cb6
init
Browse files
- .gitignore +6 -0
- README.md +76 -0
- demo.py +26 -0
- lyra_xverse/__init__.py +1 -0
- lyra_xverse/config.py +32 -0
- lyra_xverse/lyra_xverse.py +191 -0
- lyra_xverse/model.py +166 -0
- models/config.ini +14 -0
- models/special_tokens_map.json +23 -0
- models/tokenizer.json +0 -0
- models/tokenizer_config.json +5 -0
- requirements.txt +5 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md
CHANGED
@@ -1,3 +1,79 @@
---
license: mit
language: en
tags:
- LLM
- XVERSE-13B-Chat
---

## Model Card for lyraXVERSE

## Speed

* Throughput measured in tokens/s
* Tested on an A100 40G
* MEMOPT mode

### XVERSE-13B-Chat

## Docker Environment Recommendation

- For CUDA 11.X: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
- For CUDA 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraXVERSE nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_xverse import lyraXVERSE

model_path = "./models/"
tokenizer_path = "./models/"
inference_dtype = 'fp16'
prompt = "讲个故事:"  # "Tell a story:"
memopt_mode = 1
max_output_length = 512
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraXVERSE(model_path,
                   tokenizer_path=tokenizer_path,
                   dtype=inference_dtype,
                   memopt_mode=memopt_mode,
                   arch=arch,
                   cuda_version=cuda_version)
```
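Once the model is constructed, `model.generate` produces the completions. The call below mirrors `demo.py` from this commit; the sampling parameters are simply the demo's values:

```python
prompts = [prompt]
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
print(output_texts)
```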
## Demo Outputs

### XVERSE-13B-Chat

#### Input

讲个故事: ("Tell a story:")

#### Output

有一天,一位年轻的画家来到了一个偏远的村庄。他以其超凡的绘画技巧,为村民画了一幅美丽的图画。图画里,村庄的周围是翠绿的森林,清澈的溪流在其中流淌,村民们正在劳作,孩子们在田野里嬉戏。村民们看着这幅画,都对这位画家赞不绝口。

村庄的领袖看到了这幅画,他想:“这幅画将会让我们的村庄更加美丽,我们应该让村民们知道这幅画。”于是,他带着画家去村庄的各个角落,让每一个村民都看到了这幅画。

画家看着村民们看画的眼神,他意识到了自己的价值。他意识到,他不仅仅是一个画家,他也是一个能让人们看见希望的人。他的画不仅仅是艺术品,它是连接人们与希望的一座桥梁。

这个故事告诉我们,画家的价值不只是他们的绘画技巧,而是他们的画作带给人们的感动和希望。画家的价值并不在于他们的画有多么昂贵,有多么独特,而在于他们能用画作打开人们的心扉,让人们看见希望,看见生活的美好。

*(English translation: One day, a young painter arrived at a remote village. With his extraordinary skill he painted a beautiful picture for the villagers: the village surrounded by lush forest, a clear stream flowing through it, villagers at work and children playing in the fields. The villagers praised him without end. The village leader thought the painting would make the village even more beautiful and took the painter to every corner of the village so that everyone could see it. Watching the villagers' eyes, the painter realized his own worth: he was not merely a painter, but someone who could let people see hope, and his paintings were a bridge between people and hope. The story tells us that a painter's value lies not in technique, price, or uniqueness, but in the feeling and hope the work brings, opening people's hearts to the beauty of life.)*

## TODO

## Citation

```bibtex
@Misc{lyraXVERSE2023,
  author =       {Kangjian Wu and Zhengtao Wang and Yibo Lu and Haoxiong Su and Bin Wu},
  title =        {lyraXVERSE: Accelerating XVERSE-13B-Chat (fp16) to 3000+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraXVERSE}},
  year =         {2023}
}
```

## Report bugs

- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraLLaMA/discussions
- Mark the issue title with a `[bug]` tag.
demo.py
ADDED
@@ -0,0 +1,26 @@
from lyra_xverse import lyraXVERSE

model_path = "./models/"
tokenizer_path = "./models/"
inference_dtype = 'fp16'
prompt = "讲个故事:"  # "Tell a story:"

memopt_mode = 1
max_output_length = 512
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraXVERSE(model_path,
                   tokenizer_path=tokenizer_path,
                   dtype=inference_dtype,
                   memopt_mode=memopt_mode,
                   arch=arch,
                   cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
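If sampled (non-greedy) decoding is wanted instead, `generate` also accepts `do_sample=True`, in which case a random seed is drawn internally (see `lyra_xverse/lyra_xverse.py`). A minimal variant of the call above; the parameter values here are illustrative, not tuned defaults:

```python
# Sketch: sampled decoding with the same prompts; values are illustrative.
sampled_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0,
    repetition_penalty=1.0, do_sample=True)
print(sampled_texts)
```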
lyra_xverse/__init__.py
ADDED
@@ -0,0 +1 @@
from .lyra_xverse import lyraXVERSE
lyra_xverse/config.py
ADDED
@@ -0,0 +1,32 @@
import dataclasses
import pathlib
from typing import Optional

@dataclasses.dataclass
class LyraXVERSEParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)

LYRA_XVERSE_PARAM = LyraXVERSEParam()
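These dataclass defaults are the fallback used by `lyra_xverse.py` when no `config.ini` sits next to the weights. A small sketch for inspecting them or deriving a variant (the two-GPU value below is purely illustrative):

```python
import dataclasses

from lyra_xverse.config import LYRA_XVERSE_PARAM

# Dump every default hyperparameter as a plain dict.
print(LYRA_XVERSE_PARAM.asdict())

# Derive a modified copy without touching the module-level singleton.
two_gpu_params = dataclasses.replace(LYRA_XVERSE_PARAM, tensor_para_size=2)
```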
lyra_xverse/lyra_xverse.py
ADDED
@@ -0,0 +1,191 @@
from __future__ import annotations

import configparser
import pathlib
import typing
import os

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_XVERSE_PARAM
from .model import XVERSEModel


class lyraXVERSE:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=1, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'llama'

            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_XVERSE_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_XVERSE_PARAM.num_heads,
                              size_per_head=LYRA_XVERSE_PARAM.size_per_head,
                              inter_size=LYRA_XVERSE_PARAM.inter_size,
                              layer_num=LYRA_XVERSE_PARAM.num_layers,
                              rotary_embedding_dim=LYRA_XVERSE_PARAM.rotary_embedding,
                              layernorm_eps=LYRA_XVERSE_PARAM.layernorm_eps,
                              vocab_size=LYRA_XVERSE_PARAM.vocab_size,
                              start_id=LYRA_XVERSE_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_XVERSE_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_XVERSE_PARAM.weights_data_type,
                              tensor_para_size=LYRA_XVERSE_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # Load the C++ model into the PyTorch model.
        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_XVERSE_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_XVERSE_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized XVERSE model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad token '
                  'id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = XVERSEModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # We must encode the raw prompt texts one by one in order to compute the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the length of each input known, batch the list into a single tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
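Note that `load_model_and_tokenizer` resolves the FasterTransformer extension as `ftlib/libth_transformer_{sm}_{cu}.so` inside the package (sm80/sm70 for Ampere/Volta, cu11/cu12 for the CUDA version). A quick sanity-check sketch before constructing the model; the file-name pattern comes from the code above, the Ampere + CUDA 12 choice is an assumption:

```python
import pathlib

import lyra_xverse

# Mirror the naming used in load_model_and_tokenizer for arch="Ampere", cuda_version=12.
sm, cu = "sm80", "cu12"
lib = pathlib.Path(lyra_xverse.__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
assert lib.exists(), f"missing FasterTransformer library: {lib}"
```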
lyra_xverse/model.py
ADDED
@@ -0,0 +1,166 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os
import pathlib
import typing

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


class XVERSEModel(nn.Module):
    def __init__(self,
                 head_num,
                 size_per_head,
                 inter_size,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len: int,
                 layernorm_eps,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 use_gptj_residual,
                 lib_path: typing.Union[str, pathlib.Path],
                 model_path,
                 memopt_mode: int = 1,
                 inference_data_type: str = "fp16",
                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps
        self.memopt_mode = memopt_mode

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into the PyTorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallelism.
        try:
            dist.init_process_group(backend='mpi')
        except Exception:
            print("[INFO] WARNING: The process group has already been initialized")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.LlamaOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            self.memopt_mode,
            model_path,
            self.weights_data_type,
            self.inference_data_type)

        self.build_model = True
        torch.cuda.empty_cache()

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):

        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set the input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by
        forward_step_func."""
        self.input_tensor = input_tensor
models/config.ini
ADDED
@@ -0,0 +1,14 @@
[llama]
model_name = XVERSE
head_num = 40
size_per_head = 128
inter_size = 13824
num_layer = 40
rotary_embedding = 128
layernorm_eps = 1e-06
vocab_size = 100278
start_id = 2
end_id = 3
tensor_para_size = 1
weight_data_type = fp32
models/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
models/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
models/tokenizer_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "clean_up_tokenization_spaces": true,
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "PreTrainedTokenizerFast"
}
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers
numpy
setuptools
torch
sentencepiece