raspberry-3b

Running on Zero

File size: 4,152 Bytes

52da0b6
 
4afee77
52da0b6
 
 
 
4afee77
52da0b6
 
f72d17f
52da0b6
 
 
a685808
52da0b6
 
 
a685808
52da0b6
 
 
 
4afee77
52da0b6
 
 
 
 
 
 
 
 
 
 
4afee77
 
52da0b6
deb1a8e
52da0b6
 
 
e6ca87e
4afee77
52da0b6
 
 
 
f50c709
52da0b6
 
 
4afee77
52da0b6
 
 
 
 
 
4afee77
52da0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
4afee77
52da0b6
 
 
 
4afee77
 
 
52da0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a685808
 
 
 
52da0b6
 
 
4afee77

import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread


# Value of the HF_TOKEN environment variable, or None when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# Hub repository id of the checkpoint loaded by this app.
MODEL_ID = "qnguyen3/raspberry-3B"
# Value of the MODELS environment variable, or None when it is not set.
MODELS = os.environ.get("MODELS")
# Last "/"-separated component of the repository id. MODELS is optional, so
# fall back to MODEL_ID instead of calling .split() on None (which raised
# AttributeError at import time whenever the variable was unset).
MODEL_NAME = (MODELS or MODEL_ID).split("/")[-1]

# Heading markup rendered at the top of the page.
TITLE = "<h1><center>raspberry-3b</center></h1>"

# Blurb rendered under the heading. Nothing is interpolated, so this is a
# plain string assembled line by line.
DESCRIPTION = (
    "\n"
    "<center>\n"
    "<p>raspberry-3b\n"
    "<br>\n"
    "Feel free to test without log.\n"
    "</p>\n"
    "</center>\n"
)

# Stylesheet passed to gr.Blocks: styles the "duplicate" button class and
# centres <h3> headings.
CSS = (
    "\n"
    ".duplicate-button {\n"
    "    margin: auto !important;\n"
    "    color: white !important;\n"
    "    background: black !important;\n"
    "    border-radius: 100vh !important;\n"
    "}\n"
    "h3 {\n"
    "    text-align: center;\n"
    "}\n"
)

# Options for loading the checkpoint: float16 weights, with device placement
# delegated to device_map="auto".
_model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_load_options)

# NOTE(review): the tokenizer is loaded from a different repository than
# MODEL_ID -- confirm that sharing this tokenizer is intentional.
tokenizer = AutoTokenizer.from_pretrained('qnguyen3/WitchLM-1.5B')

@spaces.GPU
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
    """Stream the assistant's reply to ``message`` while it is being generated.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        temperature: Sampling temperature. A value of 0 or less selects
            greedy decoding instead of sampling.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        penalty: Repetition penalty. Non-positive values are treated as 1.0.

    Yields:
        The reply text accumulated so far, once per chunk produced by the
        streamer.
    """
    # The conversation is deliberately not printed: the page description
    # tells users they can test without logging.
    conversation = [{"role": "system", "content": 'You are a helpful assistant.'}]
    for user_turn, assistant_turn in history:
        conversation.extend([
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ])
    conversation.append({"role": "user", "content": message})

    # tokenize=False returns the rendered prompt *text*; it is tokenized below.
    prompt_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    # Follow the model's device rather than assuming CUDA index 0.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        # The UI slider allows 0, which generate() rejects; treat it as "off".
        repetition_penalty=penalty if penalty > 0 else 1.0,
        max_new_tokens=max_new_tokens,
        # Hard-coded stop-token ids; presumably the tokenizer's end-of-turn and
        # end-of-text ids -- verify against the tokenizer in use.
        eos_token_id=[151645, 151643],
    )
    if temperature > 0:
        generate_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        )
    else:
        # Sampling with temperature 0 raises inside generate(); because that
        # happens on the worker thread, the caller would only see a streamer
        # timeout. Decode greedily instead.
        generate_kwargs["do_sample"] = False

    # generate() blocks until completion, so it runs on a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    # The streamer is exhausted, so generation is finishing; reap the worker.
    thread.join()



# Chat history widget; it is handed to ChatInterface below.
chatbot = gr.Chatbot(height=450)

with gr.Blocks(css=CSS) as demo:
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    # Generation controls. They are created with render=False and passed to
    # ChatInterface, which receives them as additional inputs.
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False
    )
    max_tokens_slider = gr.Slider(
        minimum=128, maximum=4096, step=1, value=1024, label="Max new tokens", render=False
    )
    top_p_slider = gr.Slider(
        minimum=0.0, maximum=1.0, step=0.1, value=0.8, label="top_p", render=False
    )
    top_k_slider = gr.Slider(
        minimum=1, maximum=20, step=1, value=20, label="top_k", render=False
    )
    penalty_slider = gr.Slider(
        minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Repetition penalty", render=False
    )

    # One single-element row per example prompt.
    example_prompts = [
        [text]
        for text in (
            "Can you explain briefly to me what is the Python programming language?",
            "Explain the plot of Cinderella in a sentence.",
            "How many hours does it take a man to eat a Helicopter?",
            "Write a 100-word article on 'Benefits of Open-Source in AI research'",
        )
    ]

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        # Order matches stream_chat's parameters after (message, history).
        additional_inputs=[
            temperature_slider,
            max_tokens_slider,
            top_p_slider,
            top_k_slider,
            penalty_slider,
        ],
        examples=example_prompts,
        cache_examples=False,
    )


# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()