carsonhxsu committed
Commit 2ce2d73 • 1 Parent(s): 1fb9cb6
init
Browse files
- .gitignore +6 -0
- README.md +76 -0
- demo.py +26 -0
- lyra_xverse/__init__.py +1 -0
- lyra_xverse/config.py +32 -0
- lyra_xverse/lyra_xverse.py +191 -0
- lyra_xverse/model.py +166 -0
- models/config.ini +14 -0
- models/special_tokens_map.json +23 -0
- models/tokenizer.json +0 -0
- models/tokenizer_config.json +5 -0
- requirements.txt +5 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md
CHANGED
@@ -1,3 +1,79 @@
---
license: mit
language: en
tags:
- LLM
- XVERSE-13B-Chat
---

## Model Card for lyraXVERSE

## Speed

* Throughput measured in tokens/s
* Tested on an A100 40G
* MEMOPT mode

### XVERSE-13B-Chat

## Docker Environment Recommendation

- For CUDA 11.X: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
- For CUDA 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraXVERSE nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_xverse import lyraXVERSE

model_path = "./models/"
tokenizer_path = "./models/"
inference_dtype = 'fp16'
prompt = "讲个故事:"  # "Tell a story:"
memopt_mode = 1
max_output_length = 512
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraXVERSE(model_path,
                   tokenizer_path=tokenizer_path,
                   dtype=inference_dtype,
                   memopt_mode=memopt_mode,
                   arch=arch,
                   cuda_version=cuda_version)
```
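Once the model is constructed, `model.generate` produces the completions. The call below mirrors `demo.py` from this commit; the sampling parameters are simply the demo's values:

```python
prompts = [prompt]
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
print(output_texts)
```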
## Demo Outputs

### XVERSE-13B-Chat

#### Input

讲个故事: ("Tell a story:")

#### Output

有一天,一位年轻的画家来到了一个偏远的村庄。他以其超凡的绘画技巧,为村民画了一幅美丽的图画。图画里,村庄的周围是翠绿的森林,清澈的溪流在其中流淌,村民们正在劳作,孩子们在田野里嬉戏。村民们看着这幅画,都对这位画家赞不绝口。

村庄的领袖看到了这幅画,他想:“这幅画将会让我们的村庄更加美丽,我们应该让村民们知道这幅画。”于是,他带着画家去村庄的各个角落,让每一个村民都看到了这幅画。

画家看着村民们看画的眼神,他意识到了自己的价值。他意识到,他不仅仅是一个画家,他也是一个能让人们看见希望的人。他的画不仅仅是艺术品,它是连接人们与希望的一座桥梁。

这个故事告诉我们,画家的价值不只是他们的绘画技巧,而是他们的画作带给人们的感动和希望。画家的价值并不在于他们的画有多么昂贵,有多么独特,而在于他们能用画作打开人们的心扉,让人们看见希望,看见生活的美好。

*(English translation: One day, a young painter arrived at a remote village. With his extraordinary skill he painted a beautiful picture for the villagers: the village surrounded by lush forest, a clear stream flowing through it, villagers at work and children playing in the fields. The villagers praised him without end. The village leader thought the painting would make the village even more beautiful and took the painter to every corner of the village so that everyone could see it. Watching the villagers' eyes, the painter realized his own worth: he was not merely a painter, but someone who could let people see hope, and his paintings were a bridge between people and hope. The story tells us that a painter's value lies not in technique, price, or uniqueness, but in the feeling and hope the work brings, opening people's hearts to the beauty of life.)*

## TODO

## Citation

```bibtex
@Misc{lyraXVERSE2023,
  author =       {Kangjian Wu and Zhengtao Wang and Yibo Lu and Haoxiong Su and Bin Wu},
  title =        {lyraXVERSE: Accelerating XVERSE-13B-Chat (fp16) to 3000+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraXVERSE}},
  year =         {2023}
}
```

## Report bugs

- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraLLaMA/discussions
- Mark the issue title with a `[bug]` tag.
demo.py
ADDED
@@ -0,0 +1,26 @@
from lyra_xverse import lyraXVERSE

model_path = "./models/"
tokenizer_path = "./models/"
inference_dtype = 'fp16'
prompt = "讲个故事:"  # "Tell a story:"

memopt_mode = 1
max_output_length = 512
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraXVERSE(model_path,
                   tokenizer_path=tokenizer_path,
                   dtype=inference_dtype,
                   memopt_mode=memopt_mode,
                   arch=arch,
                   cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
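If sampled (non-greedy) decoding is wanted instead, `generate` also accepts `do_sample=True`, in which case a random seed is drawn internally (see `lyra_xverse/lyra_xverse.py`). A minimal variant of the call above; the parameter values here are illustrative, not tuned defaults:

```python
# Sketch: sampled decoding with the same prompts; values are illustrative.
sampled_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0,
    repetition_penalty=1.0, do_sample=True)
print(sampled_texts)
```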
lyra_xverse/__init__.py
ADDED
@@ -0,0 +1 @@
from .lyra_xverse import lyraXVERSE
lyra_xverse/config.py
ADDED
@@ -0,0 +1,32 @@
import dataclasses
import pathlib
from typing import Optional

@dataclasses.dataclass
class LyraXVERSEParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)

LYRA_XVERSE_PARAM = LyraXVERSEParam()
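These dataclass defaults are the fallback used by `lyra_xverse.py` when no `config.ini` sits next to the weights. A small sketch for inspecting them or deriving a variant (the two-GPU value below is purely illustrative):

```python
import dataclasses

from lyra_xverse.config import LYRA_XVERSE_PARAM

# Dump every default hyperparameter as a plain dict.
print(LYRA_XVERSE_PARAM.asdict())

# Derive a modified copy without touching the module-level singleton.
two_gpu_params = dataclasses.replace(LYRA_XVERSE_PARAM, tensor_para_size=2)
```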
lyra_xverse/lyra_xverse.py
ADDED
@@ -0,0 +1,191 @@
from __future__ import annotations

import configparser
import pathlib
import typing
import os

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_XVERSE_PARAM
from .model import XVERSEModel


class lyraXVERSE:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=1, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'llama'

            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_XVERSE_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_XVERSE_PARAM.num_heads,
                              size_per_head=LYRA_XVERSE_PARAM.size_per_head,
                              inter_size=LYRA_XVERSE_PARAM.inter_size,
                              layer_num=LYRA_XVERSE_PARAM.num_layers,
                              rotary_embedding_dim=LYRA_XVERSE_PARAM.rotary_embedding,
                              layernorm_eps=LYRA_XVERSE_PARAM.layernorm_eps,
                              vocab_size=LYRA_XVERSE_PARAM.vocab_size,
                              start_id=LYRA_XVERSE_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_XVERSE_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_XVERSE_PARAM.weights_data_type,
                              tensor_para_size=LYRA_XVERSE_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # Load the C++ model into the PyTorch model.
        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_XVERSE_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_XVERSE_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized XVERSE model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad token '
                  'id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = XVERSEModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # We must encode the raw prompt texts one by one in order to compute the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the length of each input known, batch the list into a single tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
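Note that `load_model_and_tokenizer` resolves the FasterTransformer extension as `ftlib/libth_transformer_{sm}_{cu}.so` inside the package (sm80/sm70 for Ampere/Volta, cu11/cu12 for the CUDA version). A quick sanity-check sketch before constructing the model; the file-name pattern comes from the code above, the Ampere + CUDA 12 choice is an assumption:

```python
import pathlib

import lyra_xverse

# Mirror the naming used in load_model_and_tokenizer for arch="Ampere", cuda_version=12.
sm, cu = "sm80", "cu12"
lib = pathlib.Path(lyra_xverse.__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
assert lib.exists(), f"missing FasterTransformer library: {lib}"
```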
lyra_xverse/model.py
ADDED
@@ -0,0 +1,166 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os
import pathlib
import typing

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


class XVERSEModel(nn.Module):
    def __init__(self,
                 head_num,
                 size_per_head,
                 inter_size,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len: int,
                 layernorm_eps,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 use_gptj_residual,
                 lib_path: typing.Union[str, pathlib.Path],
                 model_path,
                 memopt_mode: int = 1,
                 inference_data_type: str = "fp16",
                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps
        self.memopt_mode = memopt_mode

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into the PyTorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallelism.
        try:
            dist.init_process_group(backend='mpi')
        except Exception:
            print("[INFO] WARNING: The process group has already been initialized")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.LlamaOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            self.memopt_mode,
            model_path,
            self.weights_data_type,
            self.inference_data_type)

        self.build_model = True
        torch.cuda.empty_cache()

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):

        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set the input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by
        forward_step_func."""
        self.input_tensor = input_tensor
models/config.ini
ADDED
@@ -0,0 +1,14 @@
[llama]
model_name = XVERSE
head_num = 40
size_per_head = 128
inter_size = 13824
num_layer = 40
rotary_embedding = 128
layernorm_eps = 1e-06
vocab_size = 100278
start_id = 2
end_id = 3
tensor_para_size = 1
weight_data_type = fp32
models/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
models/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
models/tokenizer_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "clean_up_tokenization_spaces": true,
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "PreTrainedTokenizerFast"
}
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers
numpy
setuptools
torch
sentencepiece