Spaces: Running on Zero

Commit: add tool call
app.py (CHANGED)
Removed: the old Minitron-8B Base ChatInterface demo.

```diff
@@ -1,51 +1,126 @@
-import
-import
-import
-
-
-
-description = """
-Minitron is a family of small language models (SLMs) obtained by pruning [NVIDIA's](https://huggingface.co/nvidia) Nemotron-4 15B model. We prune model embedding size, attention heads, and MLP intermediate dimension, following which, we perform continued training with distillation to arrive at the final models.
-### Join us :
-🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [BuildTonic](https://github.com/buildtonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
-"""
 
-
-model_path = "nvidia/Minitron-8B-Base"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-
-
-model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
-
-# Define the prompt format
-def create_prompt(instruction):
-    PROMPT = '''Below is an instruction that describes a task.\n\nWrite a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'''
-    return PROMPT.format(instruction=instruction)
-
-@spaces.GPU
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    prompt = create_prompt(message)
-
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
-
-    output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)
-
-    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-    return output_text
-
-demo = gr.ChatInterface(
-    title=gr.Markdown(title),
-    # description=gr.Markdown(description),
-    fn=respond,
-    additional_inputs=[
-        gr.Textbox(value="You are Minitron an AI assistant created by Tonic-AI", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-    ],
-)
-
-if __name__ == "__main__":
-    demo.launch()
```
The new version replaces it with a Nemotron-Mini-4B-Instruct `gr.Blocks` app that adds optional function calling. First, imports and model setup:

```diff
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import json
+from globe import title, description, customtool
 
+model_path = "nvidia/Nemotron-Mini-4B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path)
+
+# Create a pipeline
+pipe = pipeline("text-generation", model=model_path)
+pipe.tokenizer = tokenizer  # Assign tokenizer manually
```
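Note that `pipeline("text-generation", model=model_path)` loads the checkpoint a second time from the hub path instead of reusing the `model` object created one line earlier. A minimal alternative sketch, assuming the standard `transformers` pipeline API and the `model`/`tokenizer` objects defined above, would read the weights only once:

```python
from transformers import pipeline

# Sketch (not part of the commit): build the pipeline from the already-loaded
# model and tokenizer instead of re-downloading/re-loading the checkpoint.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
```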
`create_prompt` builds the Nemotron prompt format, wrapping an optional tool definition in `<tool>` tags:

```diff
+def create_prompt(system_message, user_message, tool_definition=""):
+    if tool_definition:
+        return f"""<extra_id_0>System
+{system_message}
+
+<tool>
+{tool_definition}
+</tool>
+<context>
+The current date is 2023-06-01.
+</context>
+
+<extra_id_1>User
+{user_message}
+<extra_id_1>Assistant
+"""
+    else:
+        return f"<extra_id_0>System\n{system_message}\n\n<extra_id_1>User\n{user_message}\n<extra_id_1>Assistant\n"
```
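For reference, a quick hypothetical usage of `create_prompt` (not part of the commit); the commented output follows directly from the no-tool f-string above:

```python
prompt = create_prompt("You are a helpful AI assistant.", "What is 2+2?")
print(prompt)
# <extra_id_0>System
# You are a helpful AI assistant.
#
# <extra_id_1>User
# What is 2+2?
# <extra_id_1>Assistant
```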
`generate_response` either sends chat messages through the pipeline or tokenizes with `apply_chat_template` and calls `model.generate` directly. Two things to watch: `full_prompt` is the only place the tool definition is injected, yet it is never passed to the model, so the `<tool>` block currently never reaches it; and for chat-style input the pipeline's `generated_text` is the message list with the assistant reply appended, so the reply text is `[-1]['content']` (fixed below) rather than the raw `generated_text` value.

```diff
+def generate_response(message, history, system_message, max_tokens, temperature, top_p, use_pipeline=False, tool_definition=""):
+    full_prompt = create_prompt(system_message, message, tool_definition)
+
+    if use_pipeline:
+        messages = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": message},
+        ]
+        response = pipe(messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)[0]['generated_text'][-1]['content']
+    else:
+        tokenized_chat = tokenizer.apply_chat_template(
+            [
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": message},
+            ],
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        )
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                tokenized_chat,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True
+            )
+
+        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    assistant_response = response.split("<extra_id_1>Assistant\n")[-1].strip()
+
+    if tool_definition and "<toolcall>" in assistant_response:
+        tool_call = assistant_response.split("<toolcall>")[1].split("</toolcall>")[0]
+        assistant_response += f"\n\nTool Call: {tool_call}\n\nNote: This is a simulated tool call. In a real scenario, the tool would be executed and its output would be used to generate a final response."
+
+    return assistant_response
```
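The file imports `json` but never calls it; a natural follow-up, sketched here as an assumption rather than something the commit does, is to parse the extracted `<toolcall>` payload before dispatching to a real tool:

```python
import json

# Hypothetical helper, not defined in the commit: turn the raw tool_call
# string extracted in generate_response into a structured call.
def parse_tool_call(tool_call: str) -> dict:
    try:
        return json.loads(tool_call)
    except json.JSONDecodeError:
        return {}  # malformed JSON from the model; fall back to plain text
```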
The UI is a `gr.Blocks` layout: a chat column on the left, generation controls on the right, and a tool-definition editor that is only shown when function calling is enabled:

```diff
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 Nemotron-Mini-4B-Instruct Demo with Custom Function Calling")
+    gr.Markdown("This demo showcases the Nemotron-Mini-4B-Instruct model from NVIDIA, including optional custom function calling.")
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(height=400)
+            msg = gr.Textbox(label="User Input", placeholder="Ask a question or request a task...")
+            clear = gr.Button("Clear")
+
+        with gr.Column(scale=2):
+            system_message = gr.Textbox(
+                label="System Message",
+                value="You are a helpful AI assistant.",
+                lines=2,
+                placeholder="Set the AI's behavior and context..."
+            )
+            max_tokens = gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens")
+            temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
+            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
+            use_pipeline = gr.Checkbox(label="Use Pipeline", value=False)
+            use_tool = gr.Checkbox(label="Use Function Calling", value=False)
+            with gr.Column(visible=False) as tool_options:
+                tool_definition = gr.Code(
+                    label="Tool Definition (JSON)",
+                    value=customtool,
+                    lines=15,
+                    language="json"
+                    # placeholder="Enter the JSON definition of your custom tool..."
+                )
+
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+
+    def bot(history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition):
+        user_message = history[-1][0]
+        bot_message = generate_response(user_message, history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition)
+        history[-1][1] = bot_message
+        return history
+
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, [chatbot, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition], chatbot
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+    use_tool.change(
+        fn=lambda x: gr.update(visible=x),
+        inputs=[use_tool],
+        outputs=[tool_options]
+    )
 
+if __name__ == "__main__":
+    demo.launch()
```

The `msg.submit(...).then(...)` chain first echoes the user message into the chat history and clears the textbox, then runs generation, so the message appears before the model responds; `use_tool.change` toggles the tool editor's visibility via `gr.update(visible=...)`.
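Worth noting: the old file's `@spaces.GPU` decorator (and its `spaces` import) does not reappear in the new file, and the model is loaded without `device_map`, so on a Space "Running on Zero" generation would run on CPU. If ZeroGPU execution is intended, a minimal sketch, assuming the standard `spaces` package API used in the old file, would be:

```python
import spaces

@spaces.GPU  # request a ZeroGPU worker for the duration of each call
def generate_response(message, history, system_message, max_tokens,
                      temperature, top_p, use_pipeline=False, tool_definition=""):
    ...  # body unchanged from the commit
```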
globe.py (ADDED)
```diff
@@ -0,0 +1,42 @@
+title = """# 🙋🏻‍♂️Welcome to Tonic's 🤖 Nemotron-Mini-4B Demo 🚀"""
+
+description = """Nemotron-Mini-4B-Instruct is a model for generating responses for roleplaying, retrieval-augmented generation, and function calling. It is a small language model (SLM) optimized through distillation, pruning, and quantization for speed and on-device deployment. It is a fine-tuned version of [nvidia/Minitron-4B-Base](https://huggingface.co/nvidia/Minitron-4B-Base), which was pruned and distilled from [Nemotron-4 15B](https://arxiv.org/abs/2402.16819) using [our LLM compression technique](https://arxiv.org/abs/2407.14679). This instruct model is optimized for roleplay, RAG QA, and function calling in English. It supports a context length of 4,096 tokens. This model is ready for commercial use.
+
+Try this model on [build.nvidia.com](https://build.nvidia.com/nvidia/nemotron-mini-4b-instruct).
+
+**Model Developer:** NVIDIA
+
+**Model Dates:** Nemotron-Mini-4B-Instruct was trained between February 2024 and August 2024.
+
+## License
+
+[NVIDIA Community Model License](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct/blob/main/nvidia-community-model-license-aug2024.pdf)
+
+## Model Architecture
+
+Nemotron-Mini-4B-Instruct uses a model embedding size of 3072, 32 attention heads, and an MLP intermediate dimension of 9216. It also uses Grouped-Query Attention (GQA) and Rotary Position Embeddings (RoPE).
+
+**Architecture Type:** Transformer decoder (auto-regressive language model)
+
+**Network Architecture:** Nemotron-4
+"""
+
+customtool = """{
+    "name": "custom_tool",
+    "description": "A custom tool defined by the user",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "param1": {
+                "type": "string",
+                "description": "First parameter of the custom tool"
+            },
+            "param2": {
+                "type": "string",
+                "description": "Second parameter of the custom tool"
+            }
+        },
+        "required": ["param1"]
+    }
+}"""
```
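`customtool` is kept as a plain string so `gr.Code` can display and edit it. A quick sanity check, hypothetical and not part of the commit, confirms the default definition round-trips through `json.loads`:

```python
import json

from globe import customtool

# Hypothetical check: the default tool definition shown in the UI editor
# must be valid JSON for downstream parsing to work.
tool = json.loads(customtool)
assert tool["name"] == "custom_tool"
assert tool["parameters"]["required"] == ["param1"]
```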