Update app.py
app.py CHANGED
@@ -2,10 +2,6 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import subprocess
-
-# Install flash-attn
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Load the model and tokenizer
 model_name = "akjindal53244/Llama-3.1-Storm-8B"
@@ -13,7 +9,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
-    use_flash_attention_2=True,
     device_map="auto"
 )
 
@@ -25,7 +20,7 @@ def generate_text(prompt, max_length, temperature):
     ]
     formatted_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 
-    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
 
     outputs = model.generate(
         **inputs,
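The commit drops the runtime pip install of flash-attn and the use_flash_attention_2=True flag, so the model loads with transformers' default attention implementation. For reference, a minimal sketch of how FlashAttention-2 could still be enabled opportunistically, using the attn_implementation argument that superseded the deprecated use_flash_attention_2 flag; the try/except guard here is an assumption, not part of this commit:

import torch
from transformers import AutoModelForCausalLM

model_name = "akjindal53244/Llama-3.1-Storm-8B"

load_kwargs = dict(torch_dtype=torch.bfloat16, device_map="auto")
try:
    # Only request FlashAttention-2 when the flash-attn package is importable.
    import flash_attn  # noqa: F401
    load_kwargs["attn_implementation"] = "flash_attention_2"
except ImportError:
    pass  # fall back to transformers' default attention implementation

model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

Guarding the import in Python avoids the shell-based pip install at startup, which the original code had to run with FLASH_ATTENTION_SKIP_CUDA_BUILD set, suggesting CUDA was not reliably available when the Space booted.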