Tonic committed on
Commit e10040f
1 Parent(s): 8979638

add tool call

Files changed (2)
  1. app.py +123 -48
  2. globe.py +42 -0
app.py CHANGED
@@ -1,51 +1,126 @@
- import spaces
- import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- title = """# 🙋🏻‍♂️ Welcome to Tonic's Minitron-8B-Base"""
- description = """
- Minitron is a family of small language models (SLMs) obtained by pruning [NVIDIA's](https://huggingface.co/nvidia) Nemotron-4 15B model. We prune model embedding size, attention heads, and MLP intermediate dimension, following which, we perform continued training with distillation to arrive at the final models.
- ### Join us :
- 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [BuildTonic](https://github.com/buildtonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
- """

- # Load the tokenizer and model
- model_path = "nvidia/Minitron-8B-Base"
  tokenizer = AutoTokenizer.from_pretrained(model_path)

- device='cuda'
- dtype=torch.bfloat16
- model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
-
- # Define the prompt format
- def create_prompt(instruction):
-     PROMPT = '''Below is an instruction that describes a task.\n\nWrite a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'''
-     return PROMPT.format(instruction=instruction)
-
- @spaces.GPU
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     prompt = create_prompt(message)
-
-     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
-
-     output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)
-
-     output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-     return output_text
-
- demo = gr.ChatInterface(
-     title=gr.Markdown(title),
-     # description=gr.Markdown(description),
-     fn=respond,
-     additional_inputs=[
-         gr.Textbox(value="You are Minitron an AI assistant created by Tonic-AI", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-     ],
- )
-
- if __name__ == "__main__":
-     demo.launch()
 
+ import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import json
+ from globe import title, description, customtool
+
+ model_path = "nvidia/Nemotron-Mini-4B-Instruct"
  tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+
+ # Create a pipeline
+ pipe = pipeline("text-generation", model=model_path)
+ pipe.tokenizer = tokenizer  # Assign tokenizer manually
+
+ def create_prompt(system_message, user_message, tool_definition=""):
+     if tool_definition:
+         return f"""<extra_id_0>System
+ {system_message}
+
+ <tool>
+ {tool_definition}
+ </tool>
+ <context>
+ The current date is 2023-06-01.
+ </context>
+
+ <extra_id_1>User
+ {user_message}
+ <extra_id_1>Assistant
+ """
+     else:
+         return f"<extra_id_0>System\n{system_message}\n\n<extra_id_1>User\n{user_message}\n<extra_id_1>Assistant\n"
+
+ def generate_response(message, history, system_message, max_tokens, temperature, top_p, use_pipeline=False, tool_definition=""):
+     full_prompt = create_prompt(system_message, message, tool_definition)
+
+     if use_pipeline:
+         messages = [
+             {"role": "system", "content": system_message},
+             {"role": "user", "content": message},
+         ]
+         response = pipe(messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)[0]['generated_text']
+     else:
+         tokenized_chat = tokenizer.apply_chat_template(
+             [
+                 {"role": "system", "content": system_message},
+                 {"role": "user", "content": message},
+             ],
+             tokenize=True,
+             add_generation_prompt=True,
+             return_tensors="pt"
+         )
+
+         with torch.no_grad():
+             output_ids = model.generate(
+                 tokenized_chat,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 do_sample=True
+             )
+
+         response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+     assistant_response = response.split("<extra_id_1>Assistant\n")[-1].strip()
+
+     if tool_definition and "<toolcall>" in assistant_response:
+         tool_call = assistant_response.split("<toolcall>")[1].split("</toolcall>")[0]
+         assistant_response += f"\n\nTool Call: {tool_call}\n\nNote: This is a simulated tool call. In a real scenario, the tool would be executed and its output would be used to generate a final response."
+
+     return assistant_response
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🤖 Nemotron-Mini-4B-Instruct Demo with Custom Function Calling")
+     gr.Markdown("This demo showcases the Nemotron-Mini-4B-Instruct model from NVIDIA, including optional custom function calling.")
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(height=400)
+             msg = gr.Textbox(label="User Input", placeholder="Ask a question or request a task...")
+             clear = gr.Button("Clear")
+
+         with gr.Column(scale=2):
+             system_message = gr.Textbox(
+                 label="System Message",
+                 value="You are a helpful AI assistant.",
+                 lines=2,
+                 placeholder="Set the AI's behavior and context..."
+             )
+             max_tokens = gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens")
+             temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
+             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
+             use_pipeline = gr.Checkbox(label="Use Pipeline", value=False)
+             use_tool = gr.Checkbox(label="Use Function Calling", value=False)
+             with gr.Column(visible=False) as tool_options:
+                 tool_definition = gr.Code(
+                     label="Tool Definition (JSON)",
+                     value=customtool,
+                     lines=15,
+                     language="json"
+                     # placeholder="Enter the JSON definition of your custom tool..."
+                 )
+
+     def user(user_message, history):
+         return "", history + [[user_message, None]]
+
+     def bot(history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition):
+         user_message = history[-1][0]
+         bot_message = generate_response(user_message, history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition)
+         history[-1][1] = bot_message
+         return history
+
+     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, [chatbot, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition], chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+     use_tool.change(
+         fn=lambda x: gr.update(visible=x),
+         inputs=[use_tool],
+         outputs=[tool_options]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
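
A minimal, self-contained sketch (not part of the commit) of the tool-call extraction that `generate_response` performs above. The `raw_response` string is a hypothetical model output; whether the model emits valid JSON between the `<toolcall>` tags is an assumption here.

```python
# Sketch of the <toolcall>...</toolcall> parsing used in generate_response.
import json

def extract_tool_call(assistant_response):
    """Return the parsed JSON payload between <toolcall> tags, or None."""
    if "<toolcall>" not in assistant_response:
        return None
    payload = assistant_response.split("<toolcall>")[1].split("</toolcall>")[0]
    return json.loads(payload)  # assumes the model emitted valid JSON

# Hypothetical model output, for illustration only:
raw_response = (
    'I will use the tool. '
    '<toolcall>{"name": "custom_tool", "arguments": {"param1": "hello"}}</toolcall>'
)
print(extract_tool_call(raw_response))
# {'name': 'custom_tool', 'arguments': {'param1': 'hello'}}
```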
globe.py ADDED
@@ -0,0 +1,42 @@
+ title = """# 🙋🏻‍♂️ Welcome to Tonic's 🤖 Nemotron-Mini-4B Demo 🚀"""
+
+ description = """Nemotron-Mini-4B-Instruct is a model for generating responses for roleplaying, retrieval-augmented generation, and function calling. It is a small language model (SLM) optimized through distillation, pruning, and quantization for speed and on-device deployment. It is a fine-tuned version of [nvidia/Minitron-4B-Base](https://huggingface.co/nvidia/Minitron-4B-Base), which was pruned and distilled from [Nemotron-4 15B](https://arxiv.org/abs/2402.16819) using [NVIDIA's LLM compression technique](https://arxiv.org/abs/2407.14679). This instruct model is optimized for roleplay, RAG QA, and function calling in English. It supports a context length of 4,096 tokens. This model is ready for commercial use.
+
+ Try this model on [build.nvidia.com](https://build.nvidia.com/nvidia/nemotron-mini-4b-instruct).
+
+ **Model Developer:** NVIDIA
+
+ **Model Dates:** Nemotron-Mini-4B-Instruct was trained between February 2024 and August 2024.
+
+ ## License
+
+ [NVIDIA Community Model License](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct/blob/main/nvidia-community-model-license-aug2024.pdf)
+
+ ## Model Architecture
+
+ Nemotron-Mini-4B-Instruct uses a model embedding size of 3072, 32 attention heads, and an MLP intermediate dimension of 9216. It also uses Grouped-Query Attention (GQA) and Rotary Position Embeddings (RoPE).
+
+ **Architecture Type:** Transformer Decoder (auto-regressive language model)
+
+ **Network Architecture:** Nemotron-4
+ """
+
+ customtool = """{
+     "name": "custom_tool",
+     "description": "A custom tool defined by the user",
+     "parameters": {
+         "type": "object",
+         "properties": {
+             "param1": {
+                 "type": "string",
+                 "description": "First parameter of the custom tool"
+             },
+             "param2": {
+                 "type": "string",
+                 "description": "Second parameter of the custom tool"
+             }
+         },
+         "required": ["param1"]
+     }
+ }"""