add files
- .gitignore +165 -0
- chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_0.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_1.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_2.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_3.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_4.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_5.bin +3 -0
- chatglm-6b-int8-onnx-merged/model_weights_6.bin +3 -0
- chatglm-6b-int8-onnx-merged/sentencepiece.model +3 -0
- model.py +125 -0
- requirements.txt +5 -0
- tokenizer.py +75 -0
- web-ui.py +74 -0
.gitignore
ADDED
@@ -0,0 +1,165 @@
+# Project ignores
+models/
+scripts/
+data/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93c988ddb30e2eb97aafe05fd8086f56faec47e8488bc2bb6dbd19ee50ce36ae
+size 459821
chatglm-6b-int8-onnx-merged/model_weights_0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:721f5497129c8f2bbffe685892a99bdc87e00fd29b70d54d5f75df8810811cf1
+size 1069807488
chatglm-6b-int8-onnx-merged/model_weights_1.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:320f96165f0ba496292eb4dd35979d5fb5c0bbfc0fbaf83b0e8150a9959d4c8d
+size 948125696
chatglm-6b-int8-onnx-merged/model_weights_2.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92bc601207b27b08803e223b6a414eb533d3f4eeab26ed9c3b75ca4b0b977f41
+size 1006960640
chatglm-6b-int8-onnx-merged/model_weights_3.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26218891b8d13a8c3b3b5cc15b47c6ba1b5b140a614cd9a5ffb95a69e5180025
+size 1006960640
chatglm-6b-int8-onnx-merged/model_weights_4.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22f6b5087d50d39c566079a8677c1e1ef41e3b16763f4d022e00d385d4dc88af
+size 1006960640
chatglm-6b-int8-onnx-merged/model_weights_5.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5c6502fdf30878a5e75be2da7e2e134e5bfe3a132b1e98880880687cce1e703
+size 1006960640
chatglm-6b-int8-onnx-merged/model_weights_6.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82b140850685302b6939fca378a4174246304c4afb7b58b26aaecad370d2a15a
+size 671842304
chatglm-6b-int8-onnx-merged/sentencepiece.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249
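Note: the .onnx graph, the seven model_weights_*.bin shards (roughly 6.7 GB combined) and sentencepiece.model are tracked with Git LFS, so the entries above are pointer files rather than the binaries themselves; given that the .onnx file itself is under 1 MB, the .bin shards are presumably loaded as ONNX external data referenced from the graph.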
model.py
ADDED
@@ -0,0 +1,125 @@
+import re
+import numpy as np
+from tokenizer import ChatGLMTokenizer
+# import torch
+from onnxruntime import InferenceSession, SessionOptions
+
+
+# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
+# although they are documented as supported on CUDA.
+providers = ["CPUExecutionProvider"]
+
+# if torch.cuda.is_available():
+#     providers = ["CUDAExecutionProvider"] + providers
+
+
+# Default paths
+tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
+onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
+
+
+# input & output names
+past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
+present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
+output_names = ["logits"] + present_names
+
+
+# default kv_cache for first inference
+default_past_key_values = {
+    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
+}
+
+
+def chat_template(history: list[tuple[str, str]], current: str):
+    prompt = ""
+    chat_round = 0
+    for question, answer in history:
+        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
+        chat_round += 1
+    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
+    return prompt
+
+
+def process_response(response: str):
+    response = response.strip()
+    response = response.replace("[[训练时间]]", "2023年")
+    punkts = [
+        [",", ","],
+        ["!", "!"],
+        [":", ":"],
+        [";", ";"],
+        ["\?", "?"],
+    ]
+    for item in punkts:
+        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+    return response
+
+
+class ChatGLMModel():
+
+    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
+        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
+        options = SessionOptions()
+        options.enable_profiling = profile
+        self.session = InferenceSession(onnx_model_path, options, providers=providers)
+        self.eop_token_id = self.tokenizer["<eop>"]
+
+
+    def prepare_input(self, prompt: str):
+        input_ids, prefix_mask = self.tokenizer.encode(prompt)
+
+        input_ids = np.array([input_ids], dtype=np.longlong)
+        prefix_mask = np.array([prefix_mask], dtype=np.longlong)
+
+        return input_ids, prefix_mask, default_past_key_values
+
+
+    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
+        # softmax with temperature
+        exp_logits = np.exp(logits / temperature)
+        probs = exp_logits / np.sum(exp_logits)
+
+        # top k
+        top_k_idx = np.argsort(-probs)[:top_k]
+        top_k_probs = probs[top_k_idx]
+
+        # top p
+        cumsum_probs = np.cumsum(top_k_probs)
+        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
+        top_k_probs = top_k_probs / np.sum(top_k_probs)
+
+        # sample
+        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
+        return next_token[0].item()
+
+
+    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
+        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
+        output_tokens = []
+
+        while True:
+            inputs = {
+                "input_ids": input_ids,
+                "prefix_mask": prefix_mask,
+                "use_past": np.array(len(output_tokens) > 0),
+            }
+            inputs.update(past_key_values)
+
+            logits, *past_key_values = self.session.run(output_names, inputs)
+            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }
+
+            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)
+
+            output_tokens += [next_token]
+
+            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
+                break
+
+            input_ids = np.array([[next_token]], dtype=np.longlong)
+            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)
+
+            yield process_response(self.tokenizer.decode(output_tokens))
+
+        return process_response(self.tokenizer.decode(output_tokens))
+
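For reference, a minimal usage sketch of the generation API above (illustrative, not part of this commit; it assumes the script runs from the repository root so the default tokenizer_path and onnx_model_path in model.py resolve to the files added here):

from model import ChatGLMModel, chat_template

model = ChatGLMModel()                       # builds the ONNX Runtime session and the tokenizer
prompt = chat_template([], "Hello")          # empty history, one new question
answer = ""
for partial in model.generate_iterate(prompt, max_generated_tokens=64):
    answer = partial                         # each yield is the full decoded reply so far
print(answer)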
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+numpy
+onnxruntime
+sentencepiece
+streamlit
+streamlit-chat
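The dependencies can be installed with pip install -r requirements.txt; note that onnxruntime here is the CPU package, matching the CPU-only execution provider selected in model.py.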
tokenizer.py
ADDED
@@ -0,0 +1,75 @@
+import re
+from sentencepiece import SentencePieceProcessor
+
+
+def replace_spaces_with_blank(match: re.Match[str]):
+    return f"<|blank_{len(match.group())}|>"
+
+
+def replace_blank_with_spaces(match: re.Match[str]):
+    return " " * int(match.group(1))
+
+
+class ChatGLMTokenizer:
+    def __init__(self, vocab_file):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+    def __getitem__(self, key: str):
+        return self.text_tokenizer[key]
+
+
+    def preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = text.replace("\t", "<|tab|>")
+            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
+        return text
+
+
+    def encode(
+        self, text: str, text_pair: str = None,
+        linebreak=True, whitespaces=True,
+        add_dummy_prefix=True, special_tokens=True,
+    ) -> tuple[list[int], list[int]]:
+        """
+        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
+        text_pair: causal LM part.
+        linebreak: Whether to encode newline (\n) in text.
+        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self.preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+
+        tokens = self.text_tokenizer.encode(text)
+        prefix_mask = [1] * len(tokens)
+        if special_tokens:
+            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
+            prefix_mask += [1, 0]
+
+        if text_pair is not None:
+            pair_tokens = self.text_tokenizer.encode(text_pair)
+            tokens += pair_tokens
+            prefix_mask += [0] * len(pair_tokens)
+            if special_tokens:
+                tokens += [self.text_tokenizer["<eop>"]]
+                prefix_mask += [0]
+
+        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask
+
+
+    def decode(self, text_ids: list[int]) -> str:
+        text = self.text_tokenizer.decode(text_ids)
+        text = text.replace("<n>", "\n")
+        text = text.replace("<|tab|>", "\t")
+        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
+        return text
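A small round-trip sketch of the tokenizer above (illustrative, not part of this commit; it assumes the sentencepiece.model added in this commit is available at the path below):

from tokenizer import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer("chatglm-6b-int8-onnx-merged/sentencepiece.model")
# prefix_mask marks the bidirectional prompt part with 1 and the generated part with 0
ids, prefix_mask = tokenizer.encode("def main():\n    print('hi')", special_tokens=False)
assert len(ids) == len(prefix_mask)
print(tokenizer.decode(ids))  # <n>, <|tab|> and <|blank_N|> are mapped back to whitespace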
web-ui.py
ADDED
@@ -0,0 +1,74 @@
+import streamlit as st
+from streamlit_chat import message
+from model import ChatGLMModel, chat_template
+
+
+# page state
+
+@st.cache_resource
+def create_model():
+    return ChatGLMModel()
+
+with st.spinner("加载模型中..."):
+    model = create_model()
+
+
+if "history" not in st.session_state:
+    st.session_state["history"] = []
+
+
+# parameters
+
+with st.sidebar:
+    st.markdown("## 采样参数")
+
+    max_tokens = st.number_input("max_tokens", min_value=1, max_value=500, value=200)
+    temperature = st.number_input("temperature", min_value=0.1, max_value=4.0, value=1.0)
+    top_p = st.number_input("top_p", min_value=0.1, max_value=1.0, value=0.7)
+    top_k = st.number_input("top_k", min_value=1, max_value=500, value=50)
+
+    if st.button("清空上下文"):
+        st.session_state.message = ""
+        st.session_state.history = []
+
+    st.markdown("""
+[ChatGLM](https://huggingface.co/THUDM/chatglm-6b) + [ONNXRuntime](https://onnxruntime.ai/)
+""")
+
+
+# main body
+
+st.markdown("## ChatGLM + ONNXRuntime")
+
+history: list[tuple[str, str]] = st.session_state.history
+
+if len(history) == 0:
+    st.caption("请在下方输入消息开始会话")
+
+
+for idx, (question, answer) in enumerate(history):
+    message(question, is_user=True, key=f"history_question_{idx}")
+    message(answer, key=f"history_answer_{idx}")
+
+
+next_answer = st.container()
+
+question = st.text_area(label="消息", key="message")
+
+if st.button("发送") and len(question.strip()):
+    with next_answer:
+        message(question, is_user=True, key="message_question")
+        with st.spinner("正在回复中"):
+            with st.empty():
+                prompt = chat_template(history, question)
+                for answer in model.generate_iterate(
+                    prompt,
+                    max_generated_tokens=max_tokens,
+                    top_k=top_k,
+                    top_p=top_p,
+                    temperature=temperature,
+                ):
+                    st.write(answer)
+                message(answer, key="message_answer")
+
+    st.session_state.history = history + [(question, answer)]
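The chat UI is started in the usual Streamlit way, e.g. streamlit run web-ui.py from the repository root, so that the relative model and tokenizer paths in model.py resolve to the chatglm-6b-int8-onnx-merged directory added in this commit.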