Spaces:

lhoestq
/

LLM.tool

Sleeping

File size: 2,641 Bytes

a7e1afa
 
b36ec81
0f05b02
 
 
b36ec81
0f05b02
b36ec81
 
 
 
ad6ff82
b36ec81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f05b02
 
f3dd017
8f3ed37
0f05b02
 
f3dd017
8f3ed37
85c95ca
0f05b02
 
05ecaec
8f3ed37
0f05b02
b36ec81
0f05b02

import json

import gradio as gr
import requests
import uvicorn
from fastapi import FastAPI
from huggingface_hub import InferenceClient
from starlette.responses import StreamingResponse, JSONResponse

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# Module-level inference client, constructed once at import time with the
# model id below; `respond` calls `client.chat_completion` on it for each chat.
client = InferenceClient("microsoft/Phi-3-mini-4k-instruct")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the assistant's reply to ``message`` as a growing string.

    The conversation sent to the model is the system message, then every
    non-empty user/assistant entry of ``history`` in order, then ``message``
    as the final user turn.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs; a
            falsy element of a pair is skipped.
        system_message: Content of the leading ``system`` message.
        max_tokens: Forwarded unchanged to ``client.chat_completion``.
        temperature: Forwarded unchanged to ``client.chat_completion``.
        top_p: Forwarded unchanged to ``client.chat_completion``.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is deliberately not called `message`, so the
    # parameter of that name is no longer shadowed while streaming.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # A streamed chunk may carry no text (an empty `choices` list, or a
        # delta whose `content` is None); treat that as an empty token
        # instead of failing on `choices[0]` or on `str += None`.
        choices = chunk.choices
        token = choices[0].delta.content if choices else None

        response += token or ""
        yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Chat UI whose handler is `respond`. The extra widgets are declared in the
# same order as `respond`'s parameters after (message, history):
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


# FastAPI application on which the `/ask` routes below are registered.
app = FastAPI()

@app.head("/ask")
def ask_head():
    """Answer HEAD requests on ``/ask`` with an empty, JSON-typed body."""
    empty_body = ""
    return StreamingResponse(empty_body, media_type="application/json")

@app.get("/ask")
def ask_get(
    message: str = "",
    system_message: str = "You are a friendly Chatbot.",
    max_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
):
    """Ask the chat endpoint one question and return the final answer as JSON.

    Posts the query parameters (with an empty history) to the local
    ``/call/chat`` endpoint, reads the event stream for the returned
    ``event_id``, and extracts the payload of the last ``event: complete``
    record.

    Args:
        message: The user message to send.
        system_message: System prompt placed before the user message.
        max_tokens: Passed through in the request payload.
        temperature: Passed through in the request payload.
        top_p: Passed through in the request payload.

    Returns:
        ``JSONResponse`` holding a one-element list with the stripped answer
        text on success. If the POST reply has no ``event_id`` it is returned
        as-is. If the local endpoint cannot be reached, replies with
        something other than JSON, or never emits a usable
        ``event: complete`` record, a 502 ``JSONResponse`` with an ``error``
        field is returned instead of letting the exception surface as an
        opaque 500.
    """
    call_url = "http://localhost:7860/call/chat"
    payload = {"data": [message, [], system_message, max_tokens, temperature, top_p]}

    try:
        predict_response = requests.post(call_url, json=payload).json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body on every `requests` version.
        return JSONResponse(
            {"error": f"chat endpoint request failed: {err}"}, status_code=502
        )

    if "event_id" not in predict_response:
        return predict_response

    event_id = predict_response["event_id"]
    try:
        out = requests.get(f"{call_url}/{event_id}").text
    except requests.RequestException as err:
        return JSONResponse(
            {"error": f"chat event stream request failed: {err}"}, status_code=502
        )

    # Only the text after the LAST completion marker is the final answer.
    # Without the marker there is nothing valid to decode (for instance the
    # stream may have ended with a different event), so report it explicitly.
    _, marker, final_data = out.rpartition("event: complete\ndata: ")
    if not marker:
        return JSONResponse(
            {"error": "chat did not complete", "detail": out.strip()},
            status_code=502,
        )

    try:
        answer = json.loads(final_data)[0].strip()
    except (ValueError, LookupError, TypeError, AttributeError) as err:
        return JSONResponse(
            {"error": f"unexpected chat result: {err}"}, status_code=502
        )

    return JSONResponse([answer])

if __name__ == "__main__":
    # Mount the Gradio UI at the root path of the FastAPI app and serve both
    # on port 7860 -- the same host/port that `ask_get` targets with its
    # `http://localhost:7860/call/chat` requests.
    app = gr.mount_gradio_app(app, demo, path="/")
    uvicorn.run(app, host="0.0.0.0", port=7860)