Spaces: Running on Zero

Commit: add tool call
app.py (CHANGED)
Removed: the old Minitron-8B Base ChatInterface demo.

```diff
@@ -1,51 +1,126 @@
-import
-import
-import
-
-
-
-description = """
-Minitron is a family of small language models (SLMs) obtained by pruning [NVIDIA's](https://huggingface.co/nvidia) Nemotron-4 15B model. We prune model embedding size, attention heads, and MLP intermediate dimension, following which, we perform continued training with distillation to arrive at the final models.
-### Join us :
-🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [BuildTonic](https://github.com/buildtonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
-"""
 
-
-model_path = "nvidia/Minitron-8B-Base"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-
-
-model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
-
-# Define the prompt format
-def create_prompt(instruction):
-    PROMPT = '''Below is an instruction that describes a task.\n\nWrite a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'''
-    return PROMPT.format(instruction=instruction)
-
-@spaces.GPU
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    prompt = create_prompt(message)
-
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
-
-    output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)
-
-    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-    return output_text
-
-demo = gr.ChatInterface(
-    title=gr.Markdown(title),
-    # description=gr.Markdown(description),
-    fn=respond,
-    additional_inputs=[
-        gr.Textbox(value="You are Minitron an AI assistant created by Tonic-AI", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-    ],
-)
-
-if __name__ == "__main__":
-    demo.launch()
```
The new version replaces it with a Nemotron-Mini-4B-Instruct `gr.Blocks` app that adds optional function calling. First, imports and model setup:

```diff
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import json
+from globe import title, description, customtool
 
+model_path = "nvidia/Nemotron-Mini-4B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path)
+
+# Create a pipeline
+pipe = pipeline("text-generation", model=model_path)
+pipe.tokenizer = tokenizer  # Assign tokenizer manually
```
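Note that `pipeline("text-generation", model=model_path)` loads the checkpoint a second time from the hub path instead of reusing the `model` object created one line earlier. A minimal alternative sketch, assuming the standard `transformers` pipeline API and the `model`/`tokenizer` objects defined above, would read the weights only once:

```python
from transformers import pipeline

# Sketch (not part of the commit): build the pipeline from the already-loaded
# model and tokenizer instead of re-downloading/re-loading the checkpoint.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
```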
`create_prompt` builds the Nemotron prompt format, wrapping an optional tool definition in `<tool>` tags:

```diff
+def create_prompt(system_message, user_message, tool_definition=""):
+    if tool_definition:
+        return f"""<extra_id_0>System
+{system_message}
+
+<tool>
+{tool_definition}
+</tool>
+<context>
+The current date is 2023-06-01.
+</context>
+
+<extra_id_1>User
+{user_message}
+<extra_id_1>Assistant
+"""
+    else:
+        return f"<extra_id_0>System\n{system_message}\n\n<extra_id_1>User\n{user_message}\n<extra_id_1>Assistant\n"
```
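For reference, a quick hypothetical usage of `create_prompt` (not part of the commit); the commented output follows directly from the no-tool f-string above:

```python
prompt = create_prompt("You are a helpful AI assistant.", "What is 2+2?")
print(prompt)
# <extra_id_0>System
# You are a helpful AI assistant.
#
# <extra_id_1>User
# What is 2+2?
# <extra_id_1>Assistant
```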
`generate_response` either sends chat messages through the pipeline or tokenizes with `apply_chat_template` and calls `model.generate` directly. Two things to watch: `full_prompt` is the only place the tool definition is injected, yet it is never passed to the model, so the `<tool>` block currently never reaches it; and for chat-style input the pipeline's `generated_text` is the message list with the assistant reply appended, so the reply text is `[-1]['content']` (fixed below) rather than the raw `generated_text` value.

```diff
+def generate_response(message, history, system_message, max_tokens, temperature, top_p, use_pipeline=False, tool_definition=""):
+    full_prompt = create_prompt(system_message, message, tool_definition)
+
+    if use_pipeline:
+        messages = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": message},
+        ]
+        response = pipe(messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)[0]['generated_text'][-1]['content']
+    else:
+        tokenized_chat = tokenizer.apply_chat_template(
+            [
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": message},
+            ],
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        )
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                tokenized_chat,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True
+            )
+
+        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    assistant_response = response.split("<extra_id_1>Assistant\n")[-1].strip()
+
+    if tool_definition and "<toolcall>" in assistant_response:
+        tool_call = assistant_response.split("<toolcall>")[1].split("</toolcall>")[0]
+        assistant_response += f"\n\nTool Call: {tool_call}\n\nNote: This is a simulated tool call. In a real scenario, the tool would be executed and its output would be used to generate a final response."
+
+    return assistant_response
```
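The file imports `json` but never calls it; a natural follow-up, sketched here as an assumption rather than something the commit does, is to parse the extracted `<toolcall>` payload before dispatching to a real tool:

```python
import json

# Hypothetical helper, not defined in the commit: turn the raw tool_call
# string extracted in generate_response into a structured call.
def parse_tool_call(tool_call: str) -> dict:
    try:
        return json.loads(tool_call)
    except json.JSONDecodeError:
        return {}  # malformed JSON from the model; fall back to plain text
```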
The UI is a `gr.Blocks` layout: a chat column on the left, generation controls on the right, and a tool-definition editor that is only shown when function calling is enabled:

```diff
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 Nemotron-Mini-4B-Instruct Demo with Custom Function Calling")
+    gr.Markdown("This demo showcases the Nemotron-Mini-4B-Instruct model from NVIDIA, including optional custom function calling.")
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(height=400)
+            msg = gr.Textbox(label="User Input", placeholder="Ask a question or request a task...")
+            clear = gr.Button("Clear")
+
+        with gr.Column(scale=2):
+            system_message = gr.Textbox(
+                label="System Message",
+                value="You are a helpful AI assistant.",
+                lines=2,
+                placeholder="Set the AI's behavior and context..."
+            )
+            max_tokens = gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens")
+            temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
+            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
+            use_pipeline = gr.Checkbox(label="Use Pipeline", value=False)
+            use_tool = gr.Checkbox(label="Use Function Calling", value=False)
+            with gr.Column(visible=False) as tool_options:
+                tool_definition = gr.Code(
+                    label="Tool Definition (JSON)",
+                    value=customtool,
+                    lines=15,
+                    language="json"
+                    # placeholder="Enter the JSON definition of your custom tool..."
+                )
+
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+
+    def bot(history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition):
+        user_message = history[-1][0]
+        bot_message = generate_response(user_message, history, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition)
+        history[-1][1] = bot_message
+        return history
+
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, [chatbot, system_message, max_tokens, temperature, top_p, use_pipeline, tool_definition], chatbot
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+    use_tool.change(
+        fn=lambda x: gr.update(visible=x),
+        inputs=[use_tool],
+        outputs=[tool_options]
+    )
 
+if __name__ == "__main__":
+    demo.launch()
```

The `msg.submit(...).then(...)` chain first echoes the user message into the chat history and clears the textbox, then runs generation, so the message appears before the model responds; `use_tool.change` toggles the tool editor's visibility via `gr.update(visible=...)`.
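Worth noting: the old file's `@spaces.GPU` decorator (and its `spaces` import) does not reappear in the new file, and the model is loaded without `device_map`, so on a Space "Running on Zero" generation would run on CPU. If ZeroGPU execution is intended, a minimal sketch, assuming the standard `spaces` package API used in the old file, would be:

```python
import spaces

@spaces.GPU  # request a ZeroGPU worker for the duration of each call
def generate_response(message, history, system_message, max_tokens,
                      temperature, top_p, use_pipeline=False, tool_definition=""):
    ...  # body unchanged from the commit
```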
globe.py (ADDED)
```diff
@@ -0,0 +1,42 @@
+title = """# 🙋🏻‍♂️Welcome to Tonic's 🤖 Nemotron-Mini-4B Demo 🚀"""
+
+description = """Nemotron-Mini-4B-Instruct is a model for generating responses for roleplaying, retrieval-augmented generation, and function calling. It is a small language model (SLM) optimized through distillation, pruning, and quantization for speed and on-device deployment. It is a fine-tuned version of [nvidia/Minitron-4B-Base](https://huggingface.co/nvidia/Minitron-4B-Base), which was pruned and distilled from [Nemotron-4 15B](https://arxiv.org/abs/2402.16819) using [our LLM compression technique](https://arxiv.org/abs/2407.14679). This instruct model is optimized for roleplay, RAG QA, and function calling in English. It supports a context length of 4,096 tokens. This model is ready for commercial use.
+
+Try this model on [build.nvidia.com](https://build.nvidia.com/nvidia/nemotron-mini-4b-instruct).
+
+**Model Developer:** NVIDIA
+
+**Model Dates:** Nemotron-Mini-4B-Instruct was trained between February 2024 and August 2024.
+
+## License
+
+[NVIDIA Community Model License](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct/blob/main/nvidia-community-model-license-aug2024.pdf)
+
+## Model Architecture
+
+Nemotron-Mini-4B-Instruct uses a model embedding size of 3072, 32 attention heads, and an MLP intermediate dimension of 9216. It also uses Grouped-Query Attention (GQA) and Rotary Position Embeddings (RoPE).
+
+**Architecture Type:** Transformer decoder (auto-regressive language model)
+
+**Network Architecture:** Nemotron-4
+"""
+
+customtool = """{
+    "name": "custom_tool",
+    "description": "A custom tool defined by the user",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "param1": {
+                "type": "string",
+                "description": "First parameter of the custom tool"
+            },
+            "param2": {
+                "type": "string",
+                "description": "Second parameter of the custom tool"
+            }
+        },
+        "required": ["param1"]
+    }
+}"""
```
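`customtool` is kept as a plain string so `gr.Code` can display and edit it. A quick sanity check, hypothetical and not part of the commit, confirms the default definition round-trips through `json.loads`:

```python
import json

from globe import customtool

# Hypothetical check: the default tool definition shown in the UI editor
# must be valid JSON for downstream parsing to work.
tool = json.loads(customtool)
assert tool["name"] == "custom_tool"
assert tool["parameters"]["required"] == ["param1"]
```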